library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
[30m── [1mAttaching packages[22m ──────────────────────────────────────── tidyverse 1.3.0 ──[39m
[30m[32m✓[30m [34mggplot2[30m 3.3.3 [32m✓[30m [34mpurrr [30m 0.3.4
[32m✓[30m [34mtibble [30m 3.1.1 [32m✓[30m [34mdplyr [30m 1.0.5
[32m✓[30m [34mtidyr [30m 1.1.3 [32m✓[30m [34mstringr[30m 1.4.0
[32m✓[30m [34mreadr [30m 1.4.0 [32m✓[30m [34mforcats[30m 0.5.0[39m
[30m── [1mConflicts[22m ─────────────────────────────────────────── tidyverse_conflicts() ──
[31mx[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31mx[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()[39m
library(phyloseq)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
library(phangorn)
Loading required package: ape
library(readr)
library(ape)
library(vegan)
Loading required package: permute
Loading required package: lattice
This is vegan 2.5-7
Attaching package: ‘vegan’
The following objects are masked from ‘package:phangorn’:
diversity, treedist
library(RColorBrewer)
library(microbiome)
microbiome R package (microbiome.github.com)
Copyright (C) 2011-2020 Leo Lahti,
Sudarshan Shetty et al. <microbiome.github.io>
Attaching package: ‘microbiome’
The following object is masked from ‘package:vegan’:
diversity
The following object is masked from ‘package:phangorn’:
diversity
The following object is masked from ‘package:ggplot2’:
alpha
The following object is masked from ‘package:base’:
transform
library(compositions)
Welcome to compositions, a package for compositional data analysis.
Find an intro with "? compositions"
Attaching package: ‘compositions’
The following object is masked from ‘package:ape’:
balance
The following objects are masked from ‘package:stats’:
cor, cov, dist, var
The following objects are masked from ‘package:base’:
%*%, norm, scale, scale.default
library(SpiecEasi)
Attaching package: ‘SpiecEasi’
The following objects are masked from ‘package:compositions’:
alr, clr
library(otuSummary)
library(psych)
Attaching package: ‘psych’
The following objects are masked from ‘package:SpiecEasi’:
cor2cov, shannon
The following objects are masked from ‘package:compositions’:
ellipses, pairwisePlot
The following object is masked from ‘package:microbiome’:
alpha
The following objects are masked from ‘package:ggplot2’:
%+%, alpha
library(Matrix)
Attaching package: ‘Matrix’
The following objects are masked from ‘package:SpiecEasi’:
tril, triu
The following objects are masked from ‘package:tidyr’:
expand, pack, unpack
library(igraph)
Attaching package: ‘igraph’
The following object is masked from ‘package:SpiecEasi’:
make_graph
The following object is masked from ‘package:compositions’:
normalize
The following object is masked from ‘package:microbiome’:
diversity
The following object is masked from ‘package:vegan’:
diversity
The following object is masked from ‘package:permute’:
permute
The following object is masked from ‘package:phangorn’:
diversity
The following objects are masked from ‘package:ape’:
edges, mst, ring
The following objects are masked from ‘package:dplyr’:
as_data_frame, groups, union
The following objects are masked from ‘package:purrr’:
compose, simplify
The following object is masked from ‘package:tidyr’:
crossing
The following object is masked from ‘package:tibble’:
as_data_frame
The following objects are masked from ‘package:stats’:
decompose, spectrum
The following object is masked from ‘package:base’:
union
library(plotly)
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Attaching package: ‘plotly’
The following object is masked from ‘package:igraph’:
groups
The following object is masked from ‘package:ggplot2’:
last_plot
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
library(egg)
Loading required package: gridExtra
Attaching package: ‘gridExtra’
The following object is masked from ‘package:dplyr’:
combine
# Helper functions from J. Cram https://biovcnet.github.io/_pages/NetworkScience_SparCC.nb
# Identity helper: returns its argument unchanged (useful as a no-op stage
# in a pipeline).
pass <- function(x) x
# Get lower triangle of the correlation matrix
# Return the lower triangle of a correlation matrix: entries strictly above
# the diagonal are masked with NA; the diagonal and lower triangle are kept.
get_lower_tri <- function(cormat) {
  masked <- cormat
  masked[upper.tri(masked)] <- NA
  masked
}
# Get upper triangle of the correlation matrix
# Return the upper triangle of a correlation matrix: entries strictly below
# the diagonal are masked with NA; the diagonal and upper triangle are kept.
get_upper_tri <- function(cormat) {
  masked <- cormat
  masked[lower.tri(masked)] <- NA
  masked
}
# Reorder a correlation matrix by hierarchical clustering so that highly
# correlated variables end up adjacent (for cleaner heatmaps).
#
# cormat: square, symmetric correlation matrix (values in [-1, 1]).
# Returns: the same matrix with rows and columns permuted by the hclust
#   (default complete-linkage) leaf order.
reorder_cormat <- function(cormat) {
  # Map correlation to a distance in [0, 1]: r = 1 -> 0, r = -1 -> 1
  dd <- as.dist((1 - cormat) / 2)
  hc <- hclust(dd)
  # Explicit final expression: the original ended with an assignment, which
  # only returned the reordered matrix invisibly and obscured the intent.
  cormat[hc$order, hc$order]
}
# Apply a single hclust-derived ordering to a correlation matrix and its
# matching p-value matrix, so the two stay aligned row-for-row.
# Returns a list with elements r (reordered cormat) and p (reordered pmat).
reorder_cor_and_p <- function(cormat, pmat) {
  ord <- hclust(as.dist((1 - cormat) / 2))$order
  list(r = cormat[ord, ord], p = pmat[ord, ord])
}
# Custom colorblind-accessible palette, see:
# https://stackoverflow.com/questions/57153428/r-plot-color-combinations-that-are-colorblind-accessible
# (0-255 channel values with maxColorValue = 255 give the same hex strings
# as the original 0-1 fractions.)
customvermillion    <- rgb(213, 94, 0, maxColorValue = 255)
custombluegreen     <- rgb(0, 158, 115, maxColorValue = 255)
customblue          <- rgb(0, 114, 178, maxColorValue = 255)
customskyblue       <- rgb(86, 180, 233, maxColorValue = 255)
customreddishpurple <- rgb(204, 121, 167, maxColorValue = 255)
# Report versions of packages
sessionInfo()
R version 4.0.2 (2020-06-22)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Catalina 10.15.7
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] egg_0.4.5 gridExtra_2.3 plotly_4.9.2.2 igraph_1.2.6
[5] Matrix_1.3-0 psych_2.1.3 otuSummary_0.1.1 SpiecEasi_1.1.1
[9] compositions_2.0-0 microbiome_1.10.0 RColorBrewer_1.1-2 vegan_2.5-7
[13] lattice_0.20-41 permute_0.9-5 phangorn_2.5.5 ape_5.4-1
[17] phyloseq_1.32.0 forcats_0.5.0 stringr_1.4.0 dplyr_1.0.5
[21] purrr_0.3.4 readr_1.4.0 tidyr_1.1.3 tibble_3.1.1
[25] ggplot2_3.3.3 tidyverse_1.3.0
loaded via a namespace (and not attached):
[1] Rtsne_0.15 VGAM_1.1-5 colorspace_2.0-0
[4] ellipsis_0.3.1 XVector_0.28.0 fs_1.5.0
[7] rstudioapi_0.13 fansi_0.4.2 lubridate_1.7.9.2
[10] xml2_1.3.2 codetools_0.2-18 splines_4.0.2
[13] mnormt_2.0.2 robustbase_0.93-6 knitr_1.30
[16] ade4_1.7-16 jsonlite_1.7.2 broom_0.7.3
[19] cluster_2.1.0 dbplyr_2.0.0 compiler_4.0.2
[22] httr_1.4.2 backports_1.2.1 assertthat_0.2.1
[25] lazyeval_0.2.2 cli_2.4.0 htmltools_0.5.0
[28] prettyunits_1.1.1 tools_4.0.2 gtable_0.3.0
[31] glue_1.4.2 reshape2_1.4.4 fastmatch_1.1-0
[34] Rcpp_1.0.6 Biobase_2.48.0 cellranger_1.1.0
[37] vctrs_0.3.7 Biostrings_2.56.0 multtest_2.44.0
[40] nlme_3.1-151 iterators_1.0.13 tensorA_0.36.2
[43] xfun_0.19 rvest_0.3.6 lifecycle_1.0.0
[46] DEoptimR_1.0-8 zlibbioc_1.34.0 MASS_7.3-53
[49] scales_1.1.1 hms_0.5.3 parallel_4.0.2
[52] biomformat_1.16.0 rhdf5_2.32.4 huge_1.3.4.1
[55] stringi_1.5.3 S4Vectors_0.26.1 foreach_1.5.1
[58] BiocGenerics_0.34.0 shape_1.4.6 rlang_0.4.10
[61] pkgconfig_2.0.3 Rhdf5lib_1.10.1 htmlwidgets_1.5.3
[64] tidyselect_1.1.0 plyr_1.8.6 magrittr_2.0.1
[67] R6_2.5.0 IRanges_2.22.2 generics_0.1.0
[70] DBI_1.1.0 pillar_1.6.0 haven_2.3.1
[73] withr_2.4.2 mgcv_1.8-33 survival_3.2-7
[76] bayesm_3.1-4 modelr_0.1.8 pulsar_0.3.7
[79] crayon_1.4.1 utf8_1.2.1 tmvnsim_1.0-2
[82] progress_1.2.2 grid_4.0.2 readxl_1.3.1
[85] data.table_1.13.4 reprex_0.3.0 digest_0.6.27
[88] stats4_4.0.2 munsell_0.5.0 glmnet_4.1-1
[91] viridisLite_0.4.0 quadprog_1.5-8
Metadata:
metadata <- read_csv("Metadata.csv")
[36m──[39m [1m[1mColumn specification[1m[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────[39m
cols(
.default = col_double(),
`Sample Name` = [31mcol_character()[39m,
Replicate = [31mcol_character()[39m,
Type = [31mcol_character()[39m,
SizeFraction = [31mcol_character()[39m,
Season = [31mcol_character()[39m,
OxCond = [31mcol_character()[39m
)
[36mℹ[39m Use [38;5;235m[48;5;253m[38;5;235m[48;5;253m`spec()`[48;5;253m[38;5;235m[49m[39m for the full column specifications.
Import SRA table and match SRA IDs with sample IDs in metadata file
SRARunTable <- read_csv("sra_data/SraRunTable.txt")
[36m──[39m [1m[1mColumn specification[1m[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────[39m
cols(
.default = col_character(),
AvgSpotLen = [32mcol_double()[39m,
Bases = [32mcol_double()[39m,
Bytes = [32mcol_double()[39m,
ReleaseDate = [34mcol_datetime(format = "")[39m,
Depth_m = [32mcol_double()[39m,
CH4_uM = [32mcol_double()[39m,
H2S_Um = [32mcol_double()[39m,
Oxygen_uM = [32mcol_double()[39m,
Particulate_Sulfur_uM = [32mcol_double()[39m,
salinity = [32mcol_double()[39m,
Temperature_degree_C = [32mcol_double()[39m,
TZVS_uM = [32mcol_double()[39m
)
[36mℹ[39m Use [38;5;235m[48;5;253m[38;5;235m[48;5;253m`spec()`[48;5;253m[38;5;235m[49m[39m for the full column specifications.
metadata <- left_join(metadata, SRARunTable, by = 'Sample Name')
DADA2 results:
# Import Count table. Skip first row of tsv file, which is just some text
count_table <- read_tsv(file="dada2_export/ASVs_counts.tsv")
Missing column names filled in: 'X1' [1]
[36m──[39m [1m[1mColumn specification[1m[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────[39m
cols(
.default = col_double(),
X1 = [31mcol_character()[39m
)
[36mℹ[39m Use [38;5;235m[48;5;253m[38;5;235m[48;5;253m`spec()`[48;5;253m[38;5;235m[49m[39m for the full column specifications.
# And specify that the first column of data are rownames
count_table <- column_to_rownames(count_table, var = colnames(count_table)[1])
# Import taxonomy of ASVs
taxonomy <- read_tsv(file="dada2_export/ASVs_taxonomy.tsv")
Missing column names filled in: 'X1' [1]
[36m──[39m [1m[1mColumn specification[1m[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────[39m
cols(
X1 = [31mcol_character()[39m,
Kingdom = [31mcol_character()[39m,
Supergroup = [31mcol_character()[39m,
Division = [31mcol_character()[39m,
Class = [31mcol_character()[39m,
Order = [31mcol_character()[39m,
Family = [31mcol_character()[39m,
Genus = [31mcol_character()[39m,
Species = [31mcol_character()[39m
)
# And specify that the first column of data are rownames
taxonomy <- column_to_rownames(taxonomy, var = colnames(taxonomy)[1])
# Use rarecurve, from the vegan package. rarecurve expects the dataset as a dataframe so we need to use as.data.frame again:
count_table_df <- as.data.frame(count_table)
# Plot the rarefaction curves, color-coding by the colors listed in sample_info_tab, which indicate sample type, and transforming using t() again
# Running this 5-10 samples at a time because otherwise it takes a long time to render
rarecurve(t(count_table_df), step=100, cex=0.5, ylab="ASVs", label=T)
count_table_no_singletons <- filter(count_table,rowSums(count_table)>1)
# retains all ASVs (out of 14176)
And change sample names from NCBI IDs to our internal sample IDs
# Modify taxa names in count_table_no_singletons, which are the NCBI SRA numbers. Want to use our internal sample key
key <- SRARunTable %>% select(Run, 'Sample Name')
x <- (t(count_table_no_singletons))
x <- as.data.frame(cbind(x, Run = rownames(x)))
y <- t(left_join(x, key, by = "Run"))
colnames(y) <- y['Sample Name',]
y <- y[ !(rownames(y) %in% c('Sample Name', 'Run')), ]
count_table_2 <- type_convert(as.data.frame(y))
[36m──[39m [1m[1mColumn specification[1m[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────[39m
cols(
.default = col_double()
)
[36mℹ[39m Use [38;5;235m[48;5;253m[38;5;235m[48;5;253m`spec()`[48;5;253m[38;5;235m[49m[39m for the full column specifications.
This process takes a LONG time, so run it once and save an .RData object. In the DADA2 tools, there are no options to build a tree (unlike in Qiime2), but we can build it here using DECIPHER and phangorn
(Based on https://f1000research.com/articles/5-1492/v2)
Make an alignment using tools from Decipher (Note- alignment step takes several hours. Commented out for now. Only need to run once)
## import fasta
# fas <- "dada2_export/ASVs.fa"
# seqs <- readDNAStringSet(fas)
# seqs
#
# # perform the alignment
# aligned <- AlignSeqs(seqs) # automatically detects and uses all cores
#
# # view the alignment in a browser (optional)
# BrowseSeqs(aligned, highlight=0)
#
# # write out aligned sequence file
# writeXStringSet(aligned, file="ASVs.aligned.fasta")
Use phangorn package to build tree. Here we are building a maximum likelihood neighbor-joining tree. (Also takes a while to run. Comment out for now.)
# phang.align <- phyDat(as(aligned, "matrix"), type="DNA") # convert to phyDat format
# dm <- dist.ml(phang.align) # calculate pairwise distance matrix
# treeNJ <- NJ(dm) # perform neighbor-joining tree method
# fit = pml(treeNJ, data=phang.align) # compute internal max likelihood
Since the step above takes a long time, save all variables up to this point in environment as RData object
save.image("EnvironmentBackups/CariacoEuks_postanalysis_vars_upto_tree.RData")
Re-load
load("EnvironmentBackups/CariacoEuks_postanalysis_vars_upto_tree.RData")
Here we will do ordinations using the phyloseq package, which first requires making phyloseq objects out of each of our input data tables (in the last tutorial, I imported the tree using phyloseq so it is already a phyloseq object)
ASV = otu_table(count_table_2, taxa_are_rows = TRUE)
There were 15 warnings (use warnings() to see them)
TAX = tax_table(as.matrix(taxonomy))
META = sample_data(data.frame(metadata, row.names = metadata$`Sample Name`))
TREE = phy_tree(fit$tree)
First check that the inputs are in compatible formats by checking for ASV names with the phyloseq function, taxa_names
head(taxa_names(TAX))
[1] "ASV_1" "ASV_2" "ASV_3" "ASV_4" "ASV_5" "ASV_6"
head(taxa_names(ASV))
[1] "ASV_1" "ASV_2" "ASV_3" "ASV_4" "ASV_5" "ASV_6"
head(taxa_names(TREE))
[1] "ASV_1" "ASV_2" "ASV_3" "ASV_4" "ASV_5" "ASV_6"
And check sample names were also detected
head(sample_names(ASV))
[1] "AE3a103A" "AE3b103A" "AE1b900AM" "AE3a103B" "AE3b103B" "AE3a198B"
head(sample_names(META))
[1] "AE3a103A" "AE3b103A" "AE3a198A" "AE3b198A" "AE3a234A" "AE3b234A"
And make the phyloseq object
ps <- phyloseq(ASV, TAX, META , TREE)
Check some features of the phyloseq object
rank_names(ps)
[1] "Kingdom" "Supergroup" "Division" "Class" "Order" "Family" "Genus"
[8] "Species"
table(tax_table(ps)[, "Supergroup"], exclude = NULL)
Alveolata Amoebozoa Apusozoa Archaeplastida Excavata Hacrobia
8880 9 45 108 9 395
Opisthokonta Rhizaria Stramenopiles <NA>
768 2405 1086 471
unique(tax_table(ps)[, "Supergroup"])
Taxonomy Table: [10 taxa by 1 taxonomic ranks]:
Supergroup
ASV_1 "Alveolata"
ASV_2 "Rhizaria"
ASV_6 "Stramenopiles"
ASV_18 "Opisthokonta"
ASV_78 "Hacrobia"
ASV_148 "Archaeplastida"
ASV_193 NA
ASV_557 "Apusozoa"
ASV_1114 "Amoebozoa"
ASV_2665 "Excavata"
Filter out those ambiguous Supergroup annotations — losing 471 ASVs
ps <- subset_taxa(ps, !is.na(Supergroup) & !Supergroup %in% c("", "NA"))
table(tax_table(ps)[, "Supergroup"], exclude = NULL)
Alveolata Amoebozoa Apusozoa Archaeplastida Excavata Hacrobia
8880 9 45 108 9 395
Opisthokonta Rhizaria Stramenopiles
768 2405 1086
Check out the Division names
table(tax_table(ps)[, "Division"], exclude = NULL)
Apicomplexa Apusomonadidae Centroheliozoa Cercozoa
29 26 40 246
Chlorophyta Choanoflagellida Ciliophora Cryptophyta
64 54 407 50
Dinoflagellata Discoba Foraminifera Fungi
8330 1 2 57
Haptophyta Hilomonadea Katablepharidophyta Lobosa
215 17 2 9
Mesomycetozoa Metamonada Metazoa Ochrophyta
17 8 561 453
Opalozoa Opisthokonta_X Perkinsea Picozoa
216 14 5 61
Pseudofungi Radiolaria Rhodophyta Sagenista
72 2155 4 186
Stramenopiles_X Streptophyta Telonemia <NA>
61 38 27 278
Filter out any with “NA” as Division
ps <- subset_taxa(ps, !is.na(Division) & !Division %in% c(""))
table(tax_table(ps)[, "Division"], exclude = NULL)
Apicomplexa Apusomonadidae Centroheliozoa Cercozoa
29 26 40 246
Chlorophyta Choanoflagellida Ciliophora Cryptophyta
64 54 407 50
Dinoflagellata Discoba Foraminifera Fungi
8330 1 2 57
Haptophyta Hilomonadea Katablepharidophyta Lobosa
215 17 2 9
Mesomycetozoa Metamonada Metazoa Ochrophyta
17 8 561 453
Opalozoa Opisthokonta_X Perkinsea Picozoa
216 14 5 61
Pseudofungi Radiolaria Rhodophyta Sagenista
72 2155 4 186
Stramenopiles_X Streptophyta Telonemia
61 38 27
After the above, 13,427 ASVs remain from the original 14,177
Eliminate the libraries that didn’t have many sequences, AE3a198A, AE3b314A, AE2a200A, AE2b900AN, AE2a200B, AE2a267B, AE2a900BN
taxa_to_keep <- !sample_names(ps) %in% c("AE3a198A","AE3b314A","AE2a200A","AE2b900AN","AE2a200B","AE2a267B","AE2a900BN")
ps <- prune_samples(taxa_to_keep, ps)
41 samples remain and still 13,427 ASVs
Check the rarefaction curve again to make sure those low-sequencing-effort samples have been removed
rarecurve(t(otu_table(ps)), step=100, cex=0.5, ylab="ASVs", label=T)
Have to do this because you may have removed the root of your tree when pruning. (I found this handy function from here, which picks the longest branch to root from.)
There were 15 warnings (use warnings() to see them)
# first define function from link above to find furthest outgroup
# Pick the tip at the end of the longest terminal branch, for use as an
# outgroup when rooting an unrooted phylogenetic tree.
#
# tree.unrooted: an ape "phylo" object; this function reads its $edge,
#   $edge.length and $tip.label components.
# Returns: a single tip label (character scalar).
pick_new_outgroup <- function(tree.unrooted){
# NOTE(review): require() here is redundant with the library() calls at the
# top of the script, but harmless.
require("magrittr")
require("data.table")
require("ape") # ape::Ntip
# Tablify the parts of the tree we need: one row per edge (parent node,
# child node) with its branch length, restricted to the first Ntip rows.
# NOTE(review): keeping rows 1:Ntip assumes the first Ntip edges are the
# terminal (tip) edges and align with $tip.label order — this holds for the
# cladewise-ordered trees ape produces, but confirm for other orderings.
treeDT <-
cbind(
data.table(tree.unrooted$edge),
data.table(length = tree.unrooted$edge.length)
)[1:Ntip(tree.unrooted)] %>%
cbind(data.table(id = tree.unrooted$tip.label))
# Take the tip on the longest terminal branch as the outgroup
new.outgroup <- treeDT[which.max(length)]$id
return(new.outgroup) }
# then run on my phyloseq tree
my.tree <- phy_tree(ps)
out.group <- pick_new_outgroup(my.tree)
out.group
[1] "ASV_10740"
# Then use this outgroup to root the tree
new.tree1 <- ape::root(my.tree, outgroup=out.group, resolve.root=TRUE)
phy_tree(ps) <- new.tree1
# Check if tree is binary (dichotomous not multichotomous)
is.binary.tree(phy_tree(ps))
[1] TRUE
# If false, would have to run
# new.tree2 <- ape::multi2di(new.tree1)
# phy_tree(ps) <- new.tree2
# phy_tree(ps)
Check overall how the phyla are distributed among samples. Phyloseq makes this easy
# First agglomerate the ASVs at the Division level using the phyloseq function, tax_glom
DivisionGlommed = tax_glom(ps, "Division")
# There are many phyla here, so have to make a custom color palette by interpolating from an existing one in RColorBrewer
colourCount = length(table(tax_table(ps)[, "Division"], exclude = NULL))
getPalette = colorRampPalette(brewer.pal(11, "Spectral"))
DivisionPalette = getPalette(colourCount)
# and plot
plot_bar(DivisionGlommed, x = "Sample", fill = "Division") +
scale_fill_manual(values = DivisionPalette)
Plot compositional (relative abundances) instead of absolute abundance using microbiome::transform
ps_ra <- microbiome::transform(ps, transform = "compositional")
(otu_table(ps_ra))[1:5,1:5]
OTU Table: [5 taxa and 5 samples]
taxa are rows
AE3a103A AE3b103A AE1b900AM AE3a103B AE3b103B
ASV_1 4.046390e-04 0.000105531 2.462054e-05 0.000000e+00 2.400346e-05
ASV_2 0.000000e+00 0.000000000 3.132963e-02 0.000000e+00 5.600807e-05
ASV_3 6.674871e-03 0.014117702 2.265089e-02 3.696079e-03 1.055352e-02
ASV_4 1.244014e-03 0.001524337 1.231027e-05 4.769134e-05 6.720968e-04
ASV_5 2.675299e-05 0.000000000 0.000000e+00 7.948557e-06 1.040150e-04
# Then agglomerate the ASVs at the Division level using the phyloseq function, tax_glom
DivisionGlommed_RA = tax_glom(ps_ra, "Division")
# and plot
Division_barplot <- plot_bar(DivisionGlommed_RA, x = "Sample", fill = "Division") +
scale_fill_manual(values = DivisionPalette) +
theme(legend.text = element_text(size = 10))
Division_barplot
# export
ggsave("Figures/Division_barplot.eps",Division_barplot, width = 15, height = 5, units = c("in"))
Lots of dinoflagellates and radiolaria. Makes sense. But the above is the distribution from all samples. Next make plots that indicate distributions across environmental gradients. Calculate averages and use bubble plots
Get average relative abundances from sample replicates
otu_table_mean_ra <-
mutate(data.frame(otu_table(ps_ra)), "103A" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE3a103A","AE3b103A")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "198A" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE3b198A")), na.rm = TRUE)) %>% # Sample AE3a198A was removed
mutate(data.frame(otu_table(ps_ra)), "234A" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE3a234A","AE3b234A")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "295A" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE3a295A","AE3b295A")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "314A" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE3a314A")), na.rm = TRUE)) %>% # Sample AE3b314A was removed
mutate(data.frame(otu_table(ps_ra)), "900AM" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE3a900AM","AE1b900AM")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "103B" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE3a103B","AE3b103B")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "198B" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE3a198B","AE3b198B")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "234B" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE3a234B","AE3b234B")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "295B" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE3a295B","AE3b295B")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "314B" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE3a314B","AE3b314B")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "900BM" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE3a900BM","AE1b900BM")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "143A" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE2a143A","AE2b143A")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "200A" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE2b200A")), na.rm = TRUE)) %>% # AE2a200A was removed
mutate(data.frame(otu_table(ps_ra)), "237A" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE2a237A","AE2b237A")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "247A" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE2a247A","AE2b247A")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "267A" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE2a267A","AE2b267A")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "900AN" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE2a900AN")), na.rm = TRUE)) %>% # AE2b900AN was removed
mutate(data.frame(otu_table(ps_ra)), "143B" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE2a143B","AE2b143B")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "200B" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE2b200B")), na.rm = TRUE)) %>% # AE2a200B was removed
mutate(data.frame(otu_table(ps_ra)), "237B" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE2a237B","AE2b237B")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "247B" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE2a247B","AE2b247B")), na.rm = TRUE)) %>%
mutate(data.frame(otu_table(ps_ra)), "267B" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE2b267B")), na.rm = TRUE)) %>% # AE2a267B was removed
mutate(data.frame(otu_table(ps_ra)), "900BN" = rowMeans(select(data.frame(otu_table(ps_ra)), c("AE2b900BN")), na.rm = TRUE)) # AE2a900BN was removed
otu_table_mean_ra <- otu_table_mean_ra[,unique(metadata$Replicate)]
otu_table_mean_ra
Make into new phyloseq object
metadata2 <- unique(select(metadata,!c('Sample Name',Type,colnames(SRARunTable))))
META2 <- sample_data(data.frame(metadata2, row.names = metadata2$Replicate))
ps_ra_mean <- phyloseq(otu_table(otu_table_mean_ra, taxa_are_rows = TRUE), TAX, TREE, META2)
# First agglomerate the ASVs at the Division level using the phyloseq function, tax_glom
ps_ra_mean_division <- tax_glom(ps_ra_mean, "Division")
# and check by bar plotting
plot_bar(ps_ra_mean_division, x = "Sample", fill = "Division") +
scale_fill_manual(values = DivisionPalette)
Extract mean relative abundance, glommed by division, from the phyloseq object and pair it to taxonomic data
division_df <- data.frame(otu_table(ps_ra_mean_division))
colnames(division_df) <- colnames(otu_table(ps_ra_mean_division))
division_df$ASV <- rownames(division_df)
otu_table_mean_ra <- left_join(division_df, as_tibble(taxonomy, rownames = "ASV"), by = "ASV")
otu_table_mean_ra
Some manual curation for plotting
# Make a new column that has Supergroup and Division in the same column
otu_table_mean_ra$SupergroupDivision <- paste(otu_table_mean_ra$Supergroup, otu_table_mean_ra$Division)
otu_table_mean_ra
Pivot longer
otu_table_mean_ra <- pivot_longer(otu_table_mean_ra, cols = unique(metadata$Replicate), names_to = "Replicate", values_to = "Mean_RA")
otu_table_mean_ra
Join metadata
otu_table_mean_ra <- left_join(otu_table_mean_ra, unique(select(metadata, c("Replicate", "Depth", "SizeFraction", "Season", "OxCond", "Fluorescence", "BeamAtt", "O2", "Temp", "Salinity", "H2S", "ParticulateS", "TZVS", "CH4", "NO3", "NO2", "NH4", "PO4", "Chemoautotrophy", "BNP", "MicroAbun(x10^8 L^-1)", "FlagAbun(x10^5 L-1)", "VLP(x10^8 L-1)"))), by = "Replicate")
# Replace zeroes in RA with NA (better for plotting)
otu_table_mean_ra$Mean_RA[otu_table_mean_ra$Mean_RA == 0] <- NA
otu_table_mean_ra
# reorder some factors to make them plot in the order I want
otu_table_mean_ra$OxCond <- factor(otu_table_mean_ra$OxCond, levels = c("Oxycline", "ShallowAnoxic", "Euxinic"))
otu_table_mean_ra$SizeFraction <- factor(otu_table_mean_ra$SizeFraction, levels = c("PA", "FL"))
euk_divisions_bubbleplot_color <- ggplot(otu_table_mean_ra,aes (x = as.character(Depth), y = reorder(SupergroupDivision, Mean_RA, function(x){sum(x,na.rm = TRUE)}), color = OxCond)) +
geom_point(aes(size =Mean_RA))+
facet_wrap(Season~SizeFraction, scales = "free_x", drop= TRUE, ncol = 4) +
scale_size(range = c(1,15)) +
scale_size_area(breaks = c(0,.25,.5,.75,1), max_size = 6) +
xlab("Depth") +
ylab("") +
labs(size="Relative Abundance", color = "Redox Condition") +
scale_color_manual(values = c("blue", "red", "brown4")) +
theme_bw() +
theme(axis.text.x = element_text(size=10),
axis.text.y = element_text(size=10),
axis.title.x= element_text(size=12),
axis.title.y= element_text(size=12))
Scale for 'size' is already present. Adding another scale for 'size', which will replace the existing
scale.
euk_divisions_bubbleplot_color
Save figure
# set explicit panel size so they will be consistent for all figures
euk_divisions_bubbleplot_color <- set_panel_size(euk_divisions_bubbleplot_color, width = unit(1.7, "in"), height = unit(5.5, "in"))
Removed 249 rows containing missing values (geom_point).
ggsave(filename = "Figures/euk_divisions_bubbleplot_color.eps", plot = euk_divisions_bubbleplot_color, units = c("in"), width = 14, height = 8, dpi = 300)
Filter to only Alveolates; glom by order
keeptaxa <- taxa_names(ps_ra_mean)[(as.data.frame(tax_table(ps_ra_mean))$Supergroup %in% c("Alveolata"))]
ps_ra_mean_alveolates <- prune_taxa(keeptaxa, ps_ra_mean)
ps_ra_mean_alveolate_orders <- tax_glom(ps_ra_mean_alveolates, "Order")
aveloates_df <- data.frame(otu_table(ps_ra_mean_alveolate_orders))
colnames(aveloates_df) <- colnames(otu_table(ps_ra_mean_alveolate_orders))
aveloates_df$ASV <- rownames(aveloates_df)
otu_table_mean_ra <- left_join(aveloates_df, as_tibble(taxonomy, rownames = "ASV"), by = "ASV")
otu_table_mean_ra
Some manual curation for plotting
# Make a new column that has descriptive taxonomy
otu_table_mean_ra$Descriptive <- paste(otu_table_mean_ra$Division, otu_table_mean_ra$Class, otu_table_mean_ra$Order)
otu_table_mean_ra
Pivot longer
otu_table_mean_ra <- pivot_longer(otu_table_mean_ra, cols = unique(metadata$Replicate), names_to = "Replicate", values_to = "Mean_RA")
otu_table_mean_ra
Join metadata
otu_table_mean_ra <- left_join(otu_table_mean_ra, unique(select(metadata, c("Replicate", "Depth", "SizeFraction", "Season", "OxCond", "Fluorescence", "BeamAtt", "O2", "Temp", "Salinity", "H2S", "ParticulateS", "TZVS", "CH4", "NO3", "NO2", "NH4", "PO4", "Chemoautotrophy", "BNP", "MicroAbun(x10^8 L^-1)", "FlagAbun(x10^5 L-1)", "VLP(x10^8 L-1)"))), by = "Replicate")
# Replace zeroes in RA with NA (better for plotting)
otu_table_mean_ra$Mean_RA[otu_table_mean_ra$Mean_RA == 0] <- NA
otu_table_mean_ra
# reorder some factors to make them plot in the order I want
otu_table_mean_ra$OxCond <- factor(otu_table_mean_ra$OxCond, levels = c("Oxycline", "ShallowAnoxic", "Euxinic"))
otu_table_mean_ra$SizeFraction <- factor(otu_table_mean_ra$SizeFraction, levels = c("PA", "FL"))
alveolata_bubbleplot_color <- ggplot(otu_table_mean_ra,aes (x = as.character(Depth), y = reorder(Descriptive, Mean_RA, function(x){sum(x,na.rm = TRUE)}), color = OxCond)) +
geom_point(aes(size =Mean_RA))+
facet_wrap(Season~SizeFraction, scales = "free_x", drop= TRUE, ncol = 4) +
scale_size(range = c(1,15)) +
scale_size_area(breaks = c(0,.25,.5,.75,1), max_size = 6) +
xlab("Depth") +
ylab("") +
labs(size="Relative Abundance", color = "Redox Condition") +
scale_color_manual(values = c("blue", "red", "brown4")) +
theme_bw() +
theme(axis.text.x = element_text(size=10),
axis.text.y = element_text(size=10),
axis.title.x= element_text(size=12),
axis.title.y= element_text(size=12))
Scale for 'size' is already present. Adding another scale for 'size', which will replace the existing
scale.
alveolata_bubbleplot_color
Save figure
# set explicit panel size so they will be consistent for all figures
alveolata_bubbleplot_color <- set_panel_size(alveolata_bubbleplot_color, width = unit(1.7, "in"), height = unit(7, "in"))
Removed 724 rows containing missing values (geom_point).
ggsave(filename = "Figures/alveolata_bubbleplot_color.eps", plot = alveolata_bubbleplot_color, units = c("in"), width = 14, height = 8, dpi = 300)
Filter to only Rhizaria; glom by order
keeptaxa <- taxa_names(ps_ra_mean)[(as.data.frame(tax_table(ps_ra_mean))$Supergroup %in% c("Rhizaria"))]
Warning messages:
1: In readChar(file, size, TRUE) : truncating string with embedded nuls
2: In readChar(file, size, TRUE) : truncating string with embedded nuls
3: In readChar(file, size, TRUE) : truncating string with embedded nuls
4: In readChar(file, size, TRUE) : truncating string with embedded nuls
5: In readChar(file, size, TRUE) : truncating string with embedded nuls
6: In readChar(file, size, TRUE) : truncating string with embedded nuls
7: In readChar(file, size, TRUE) : truncating string with embedded nuls
ps_ra_mean_rhizaria <- prune_taxa(keeptaxa, ps_ra_mean)
ps_ra_mean_rhizaria_orders <- tax_glom(ps_ra_mean_rhizaria, "Order")
rhizaria_df <- data.frame(otu_table(ps_ra_mean_rhizaria_orders))
colnames(rhizaria_df) <- colnames(otu_table(ps_ra_mean_rhizaria_orders))
rhizaria_df$ASV <- rownames(rhizaria_df)
otu_table_mean_ra <- left_join(rhizaria_df, as_tibble(taxonomy, rownames = "ASV"), by = "ASV")
otu_table_mean_ra
Some manual curation for plotting
# Make a new column that has descriptive taxonomy
# Descriptive = "Division Class Order"; used as the y-axis label in the bubble plot
otu_table_mean_ra$Descriptive <- paste(otu_table_mean_ra$Division, otu_table_mean_ra$Class, otu_table_mean_ra$Order)
otu_table_mean_ra
Pivot longer
# Wide -> long: one row per (Order, Replicate); Replicate columns come from metadata
otu_table_mean_ra <- pivot_longer(otu_table_mean_ra, cols = unique(metadata$Replicate), names_to = "Replicate", values_to = "Mean_RA")
otu_table_mean_ra
Join metadata
otu_table_mean_ra <- left_join(otu_table_mean_ra, unique(select(metadata, c("Replicate", "Depth", "SizeFraction", "Season", "OxCond", "Fluorescence", "BeamAtt", "O2", "Temp", "Salinity", "H2S", "ParticulateS", "TZVS", "CH4", "NO3", "NO2", "NH4", "PO4", "Chemoautotrophy", "BNP", "MicroAbun(x10^8 L^-1)", "FlagAbun(x10^5 L-1)", "VLP(x10^8 L-1)"))), by = "Replicate")
# Replace zeroes in RA with NA (better for plotting)
otu_table_mean_ra$Mean_RA[otu_table_mean_ra$Mean_RA == 0] <- NA
otu_table_mean_ra
# reorder some factors to make them plot in the order I want
otu_table_mean_ra$OxCond <- factor(otu_table_mean_ra$OxCond, levels = c("Oxycline", "ShallowAnoxic", "Euxinic"))
otu_table_mean_ra$SizeFraction <- factor(otu_table_mean_ra$SizeFraction, levels = c("PA", "FL"))
# Bubble plot of Rhizaria order-level mean relative abundance by depth,
# faceted by Season x SizeFraction and colored by redox condition.
# Rows (orders) are sorted by their total abundance across samples.
# FIX: the original chained scale_size() and then scale_size_area(); the second
# silently replaced the first (hence the "Scale for 'size' is already present"
# message in the transcript), so only scale_size_area() is kept.
rhizaria_bubbleplot_color <- ggplot(otu_table_mean_ra, aes(x = as.character(Depth), y = reorder(Descriptive, Mean_RA, function(x) { sum(x, na.rm = TRUE) }), color = OxCond)) +
  geom_point(aes(size = Mean_RA)) +
  facet_wrap(Season ~ SizeFraction, scales = "free_x", drop = TRUE, ncol = 4) +
  scale_size_area(breaks = c(0, .25, .5, .75, 1), max_size = 6) +
  xlab("Depth") +
  ylab("") +
  labs(size = "Relative Abundance", color = "Redox Condition") +
  scale_color_manual(values = c("blue", "red", "brown4")) +
  theme_bw() +
  theme(axis.text.x = element_text(size = 10),
        axis.text.y = element_text(size = 10),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12))
Scale for 'size' is already present. Adding another scale for 'size', which will replace the existing
scale.
rhizaria_bubbleplot_color
Save figure
# set explicit panel size so they will be consistent for all figures
# set_panel_size presumably comes from the egg package — TODO confirm it is loaded earlier in the file
rhizaria_bubbleplot_color <- set_panel_size(rhizaria_bubbleplot_color, width = unit(1.7, "in"), height = unit(5, "in"))
Removed 465 rows containing missing values (geom_point).
ggsave(filename = "Figures/rhizaria_bubbleplot_color.eps", plot = rhizaria_bubbleplot_color, units = c("in"), width = 14, height = 8, dpi = 300)
Filter to only Opisthokonta; glom by order
# Same pipeline as for Rhizaria above, now for Supergroup Opisthokonta:
# subset, glom to Order, convert to a data frame, and join taxonomy by ASV id.
keeptaxa <- taxa_names(ps_ra_mean)[(as.data.frame(tax_table(ps_ra_mean))$Supergroup %in% c("Opisthokonta"))]
ps_ra_mean_opisthokonta <- prune_taxa(keeptaxa, ps_ra_mean)
ps_ra_mean_opisthokonta_orders <- tax_glom(ps_ra_mean_opisthokonta, "Order")
opisthokonta_df <- data.frame(otu_table(ps_ra_mean_opisthokonta_orders))
# restore sample-name columns mangled by data.frame()
colnames(opisthokonta_df) <- colnames(otu_table(ps_ra_mean_opisthokonta_orders))
opisthokonta_df$ASV <- rownames(opisthokonta_df)
otu_table_mean_ra <- left_join(opisthokonta_df, as_tibble(taxonomy, rownames = "ASV"), by = "ASV")
otu_table_mean_ra
Some manual curating for plotting
# Make a new column that has descriptive taxonomy
# Descriptive = "Division Class Order"; used as the y-axis label in the bubble plot
otu_table_mean_ra$Descriptive <- paste(otu_table_mean_ra$Division, otu_table_mean_ra$Class, otu_table_mean_ra$Order)
otu_table_mean_ra
Pivot longer
# Wide -> long: one row per (Order, Replicate)
otu_table_mean_ra <- pivot_longer(otu_table_mean_ra, cols = unique(metadata$Replicate), names_to = "Replicate", values_to = "Mean_RA")
otu_table_mean_ra
Join metadata
otu_table_mean_ra <- left_join(otu_table_mean_ra, unique(select(metadata, c("Replicate", "Depth", "SizeFraction", "Season", "OxCond", "Fluorescence", "BeamAtt", "O2", "Temp", "Salinity", "H2S", "ParticulateS", "TZVS", "CH4", "NO3", "NO2", "NH4", "PO4", "Chemoautotrophy", "BNP", "MicroAbun(x10^8 L^-1)", "FlagAbun(x10^5 L-1)", "VLP(x10^8 L-1)"))), by = "Replicate")
# Replace zeroes in RA with NA (better for plotting)
otu_table_mean_ra$Mean_RA[otu_table_mean_ra$Mean_RA == 0] <- NA
otu_table_mean_ra
# reorder some factors to make them plot in the order I want
otu_table_mean_ra$OxCond <- factor(otu_table_mean_ra$OxCond, levels = c("Oxycline", "ShallowAnoxic", "Euxinic"))
otu_table_mean_ra$SizeFraction <- factor(otu_table_mean_ra$SizeFraction, levels = c("PA", "FL"))
# Bubble plot of Opisthokonta order-level mean relative abundance by depth,
# faceted by Season x SizeFraction and colored by redox condition.
# (Variable name keeps the original "opithokonta" spelling since later lines refer to it.)
# FIX: dropped the redundant scale_size(range = c(1, 15)) that scale_size_area()
# immediately replaced (it only produced the "Scale for 'size' is already present" message).
opithokonta_bubbleplot_color <- ggplot(otu_table_mean_ra, aes(x = as.character(Depth), y = reorder(Descriptive, Mean_RA, function(x) { sum(x, na.rm = TRUE) }), color = OxCond)) +
  geom_point(aes(size = Mean_RA)) +
  facet_wrap(Season ~ SizeFraction, scales = "free_x", drop = TRUE, ncol = 4) +
  scale_size_area(breaks = c(0, .25, .5, .75, 1), max_size = 6) +
  xlab("Depth") +
  ylab("") +
  labs(size = "Relative Abundance", color = "Redox Condition") +
  scale_color_manual(values = c("blue", "red", "brown4")) +
  theme_bw() +
  theme(axis.text.x = element_text(size = 10),
        axis.text.y = element_text(size = 10),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12))
Scale for 'size' is already present. Adding another scale for 'size', which will replace the existing
scale.
opithokonta_bubbleplot_color
Save figure
# set explicit panel size so they will be consistent for all figures
opithokonta_bubbleplot_color <- set_panel_size(opithokonta_bubbleplot_color, width = unit(1.7, "in"), height = unit(5, "in"))
Removed 447 rows containing missing values (geom_point).
# Export as EPS at the same 14 x 8 in canvas used for the other bubble plots
ggsave(filename = "Figures/opithokonta_bubbleplot_color.eps", plot = opithokonta_bubbleplot_color, units = c("in"), width = 14, height = 8, dpi = 300)
Filter to only Stramenopiles; glom by order
# Keep only ASVs whose Supergroup is Stramenopiles
keeptaxa <- taxa_names(ps_ra_mean)[(as.data.frame(tax_table(ps_ra_mean))$Supergroup %in% c("Stramenopiles"))]
Warning messages:
1: In readChar(file, size, TRUE) : truncating string with embedded nuls
2: In readChar(file, size, TRUE) : truncating string with embedded nuls
3: In readChar(file, size, TRUE) : truncating string with embedded nuls
4: In readChar(file, size, TRUE) : truncating string with embedded nuls
5: In readChar(file, size, TRUE) : truncating string with embedded nuls
6: In readChar(file, size, TRUE) : truncating string with embedded nuls
7: In readChar(file, size, TRUE) : truncating string with embedded nuls
8: In readChar(file, size, TRUE) : truncating string with embedded nuls
9: In readChar(file, size, TRUE) : truncating string with embedded nuls
# Subset to Stramenopiles, glom to Order, and build the plotting data frame
ps_ra_mean_stramenopiles <- prune_taxa(keeptaxa, ps_ra_mean)
ps_ra_mean_stramenopiles_orders <- tax_glom(ps_ra_mean_stramenopiles, "Order")
stramenopiles_df <- data.frame(otu_table(ps_ra_mean_stramenopiles_orders))
# restore sample-name columns mangled by data.frame()
colnames(stramenopiles_df) <- colnames(otu_table(ps_ra_mean_stramenopiles_orders))
stramenopiles_df$ASV <- rownames(stramenopiles_df)
otu_table_mean_ra <- left_join(stramenopiles_df, as_tibble(taxonomy, rownames = "ASV"), by = "ASV")
otu_table_mean_ra
Some manual curating for plotting
# Make a new column that has descriptive taxonomy
# Descriptive = "Division Class Order"; used as the y-axis label in the bubble plot
otu_table_mean_ra$Descriptive <- paste(otu_table_mean_ra$Division, otu_table_mean_ra$Class, otu_table_mean_ra$Order)
otu_table_mean_ra
Pivot longer
# Wide -> long: one row per (Order, Replicate)
otu_table_mean_ra <- pivot_longer(otu_table_mean_ra, cols = unique(metadata$Replicate), names_to = "Replicate", values_to = "Mean_RA")
otu_table_mean_ra
Join metadata
otu_table_mean_ra <- left_join(otu_table_mean_ra, unique(select(metadata, c("Replicate", "Depth", "SizeFraction", "Season", "OxCond", "Fluorescence", "BeamAtt", "O2", "Temp", "Salinity", "H2S", "ParticulateS", "TZVS", "CH4", "NO3", "NO2", "NH4", "PO4", "Chemoautotrophy", "BNP", "MicroAbun(x10^8 L^-1)", "FlagAbun(x10^5 L-1)", "VLP(x10^8 L-1)"))), by = "Replicate")
# Replace zeroes in RA with NA (better for plotting)
otu_table_mean_ra$Mean_RA[otu_table_mean_ra$Mean_RA == 0] <- NA
otu_table_mean_ra
# reorder some factors to make them plot in the order I want
otu_table_mean_ra$OxCond <- factor(otu_table_mean_ra$OxCond, levels = c("Oxycline", "ShallowAnoxic", "Euxinic"))
otu_table_mean_ra$SizeFraction <- factor(otu_table_mean_ra$SizeFraction, levels = c("PA", "FL"))
# Bubble plot of Stramenopiles order-level mean relative abundance by depth,
# faceted by Season x SizeFraction and colored by redox condition.
# FIX: dropped the redundant scale_size(range = c(1, 15)) that scale_size_area()
# immediately replaced (it only produced the "Scale for 'size' is already present" message).
stramenopiles_bubbleplot_color <- ggplot(otu_table_mean_ra, aes(x = as.character(Depth), y = reorder(Descriptive, Mean_RA, function(x) { sum(x, na.rm = TRUE) }), color = OxCond)) +
  geom_point(aes(size = Mean_RA)) +
  facet_wrap(Season ~ SizeFraction, scales = "free_x", drop = TRUE, ncol = 4) +
  scale_size_area(breaks = c(0, .25, .5, .75, 1), max_size = 6) +
  xlab("Depth") +
  ylab("") +
  labs(size = "Relative Abundance", color = "Redox Condition") +
  scale_color_manual(values = c("blue", "red", "brown4")) +
  theme_bw() +
  theme(axis.text.x = element_text(size = 10),
        axis.text.y = element_text(size = 10),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12))
Scale for 'size' is already present. Adding another scale for 'size', which will replace the existing
scale.
stramenopiles_bubbleplot_color
Save figure
# set explicit panel size so they will be consistent for all figures
# (taller panel here — 10 in — because Stramenopiles has more orders to show)
stramenopiles_bubbleplot_color <- set_panel_size(stramenopiles_bubbleplot_color, width = unit(1.7, "in"), height = unit(10, "in"))
Removed 1407 rows containing missing values (geom_point).
ggsave(filename = "Figures/stramenopiles_bubbleplot_color.eps", plot = stramenopiles_bubbleplot_color, units = c("in"), width = 14, height = 11, dpi = 300)
# Shannon diversity per sample from the raw count table. vegan::diversity()
# expects samples as rows, hence the first t(); the second t() turns the result
# into a 1-row matrix whose column names are sample IDs, which the
# replicate-averaging block below relies on (rowMeans over selected columns).
shannons <- vegan::diversity(t(otu_table(ps)), index = "shannon")
shannons <- t(shannons)
shannons
AE3a103A AE3b103A AE1b900AM AE3a103B AE3b103B AE3a198B AE3b198B AE3a234B AE3b234B AE3a295B AE3b295B AE3a314B AE3b198A AE3b314B
[1,] 4.871221 4.956114 2.916447 4.192101 5.048457 5.352167 5.143548 5.169616 4.959116 2.736109 3.53949 2.780448 4.391812 3.143426
AE3a900BM AE1b900BM AE2a143A AE2b143A AE2b200A AE2a237A AE2b237A AE2a247A AE3a234A AE2b247A AE2a267A AE2b267A AE2a900AN
[1,] 3.137984 2.137569 3.083671 4.690686 3.128682 4.191647 4.308389 2.398659 5.334367 2.36533 3.826925 3.929226 3.047765
AE2a143B AE2b143B AE2b200B AE2a237B AE3b234A AE2b237B AE2a247B AE2b247B AE2b267B AE2b900BN AE3a295A AE3b295A AE3a314A
[1,] 4.962882 3.019449 4.772924 2.413723 4.62931 3.37624 2.595961 2.714695 4.361093 4.492629 3.07776 2.638438 4.522401
AE3a900AM
[1,] 3.592396
# Average Shannon's index across replicate pairs so diversity is reported per
# sample Type (depth/season/size fraction), matching the replicate-averaged
# relative-abundance tables used above. Singleton groups (where one replicate
# was removed) are just passed through by rowMeans.
# FIX: the original re-passed data.frame(shannons) as the first argument of
# every piped mutate(); inside a %>% pipe that argument is treated as an
# unnamed (data-frame) mutate argument and spliced as redundant duplicate
# columns rather than used as the data. The pipe now starts from
# data.frame(shannons) once and each mutate() receives only its new column.
shannons_mean <-
  data.frame(shannons) %>%
  mutate("103A" = rowMeans(select(data.frame(shannons), c("AE3a103A","AE3b103A")), na.rm = TRUE)) %>%
  mutate("198A" = rowMeans(select(data.frame(shannons), c("AE3b198A")), na.rm = TRUE)) %>% # Sample AE3a198A was removed
  mutate("234A" = rowMeans(select(data.frame(shannons), c("AE3a234A","AE3b234A")), na.rm = TRUE)) %>%
  mutate("295A" = rowMeans(select(data.frame(shannons), c("AE3a295A","AE3b295A")), na.rm = TRUE)) %>%
  mutate("314A" = rowMeans(select(data.frame(shannons), c("AE3a314A")), na.rm = TRUE)) %>% # Sample AE3b314A was removed
  mutate("900AM" = rowMeans(select(data.frame(shannons), c("AE3a900AM","AE1b900AM")), na.rm = TRUE)) %>%
  mutate("103B" = rowMeans(select(data.frame(shannons), c("AE3a103B","AE3b103B")), na.rm = TRUE)) %>%
  mutate("198B" = rowMeans(select(data.frame(shannons), c("AE3a198B","AE3b198B")), na.rm = TRUE)) %>%
  mutate("234B" = rowMeans(select(data.frame(shannons), c("AE3a234B","AE3b234B")), na.rm = TRUE)) %>%
  mutate("295B" = rowMeans(select(data.frame(shannons), c("AE3a295B","AE3b295B")), na.rm = TRUE)) %>%
  mutate("314B" = rowMeans(select(data.frame(shannons), c("AE3a314B","AE3b314B")), na.rm = TRUE)) %>%
  mutate("900BM" = rowMeans(select(data.frame(shannons), c("AE3a900BM","AE1b900BM")), na.rm = TRUE)) %>%
  mutate("143A" = rowMeans(select(data.frame(shannons), c("AE2a143A","AE2b143A")), na.rm = TRUE)) %>%
  mutate("200A" = rowMeans(select(data.frame(shannons), c("AE2b200A")), na.rm = TRUE)) %>% # AE2a200A was removed
  mutate("237A" = rowMeans(select(data.frame(shannons), c("AE2a237A","AE2b237A")), na.rm = TRUE)) %>%
  mutate("247A" = rowMeans(select(data.frame(shannons), c("AE2a247A","AE2b247A")), na.rm = TRUE)) %>%
  mutate("267A" = rowMeans(select(data.frame(shannons), c("AE2a267A","AE2b267A")), na.rm = TRUE)) %>%
  mutate("900AN" = rowMeans(select(data.frame(shannons), c("AE2a900AN")), na.rm = TRUE)) %>% # AE2b900AN was removed
  mutate("143B" = rowMeans(select(data.frame(shannons), c("AE2a143B","AE2b143B")), na.rm = TRUE)) %>%
  mutate("200B" = rowMeans(select(data.frame(shannons), c("AE2b200B")), na.rm = TRUE)) %>% # AE2a200B was removed
  mutate("237B" = rowMeans(select(data.frame(shannons), c("AE2a237B","AE2b237B")), na.rm = TRUE)) %>%
  mutate("247B" = rowMeans(select(data.frame(shannons), c("AE2a247B","AE2b247B")), na.rm = TRUE)) %>%
  mutate("267B" = rowMeans(select(data.frame(shannons), c("AE2b267B")), na.rm = TRUE)) %>% # AE2a267B was removed
  mutate("900BN" = rowMeans(select(data.frame(shannons), c("AE2b900BN")), na.rm = TRUE)) # AE2a900BN was removed
# Keep only the replicate-averaged columns, in metadata order
shannons_mean <- shannons_mean[,unique(metadata$Replicate)]
shannons_mean
# Pivot longer
shannons_mean <- pivot_longer(shannons_mean, cols = unique(metadata$Replicate), names_to = "Replicate", values_to = "Shannons")
# Join metadata
shannons_mean <- left_join(shannons_mean, unique(select(metadata, c("Replicate", "Depth", "SizeFraction", "Season", "OxCond", "Fluorescence", "BeamAtt", "O2", "Temp", "Salinity", "H2S", "ParticulateS", "TZVS", "CH4", "NO3", "NO2", "NH4", "PO4", "Chemoautotrophy", "BNP", "MicroAbun(x10^8 L^-1)", "FlagAbun(x10^5 L-1)", "VLP(x10^8 L-1)"))), by = "Replicate")
shannons_mean
NA
NA
# reorder some factors to make them plot in the order I want
shannons_mean$OxCond <- factor(shannons_mean$OxCond, levels = c("Oxycline", "ShallowAnoxic", "Euxinic"))
shannons_mean$SizeFraction <- factor(shannons_mean$SizeFraction, levels = c("PA", "FL"))
# Shannon diversity vs depth (depth drawn downward via reversed x + coord_flip),
# faceted by Season x SizeFraction and colored by redox condition.
# FIX: the original added scale_x_continuous(name = "depth [m]") and then
# scale_x_reverse(expand = c(0, 0)); the second replaced the first (losing the
# intended axis label and emitting the "Scale for 'x' is already present"
# message). The two are merged into a single scale_x_reverse() call.
shannonsplot <- ggplot(shannons_mean, aes(x = Depth, y = Shannons, color = OxCond)) +
  geom_line(size = 1, color = "black", lty = "dotted") +
  geom_point(size = 3, shape = c(16)) +
  scale_x_reverse(name = "depth [m]", expand = c(0, 0)) +
  coord_flip(xlim = c(910, 100)) +
  theme_bw() +
  theme(legend.position = "right") +
  facet_wrap(Season ~ SizeFraction, drop = TRUE, ncol = 4) +
  scale_color_manual(values = c("blue", "red", "brown4")) +
  labs(color = "Redox Condition")
Scale for 'x' is already present. Adding another scale for 'x', which will replace the existing scale.
shannonsplot
Export Plot
# set explicit panel size so they will be consistent for all figures
shannonsplot <- set_panel_size(shannonsplot, width = unit(1.7, "in"), height = unit(3.5, "in"))
ggsave(filename = "Figures/shannonsplot.eps", plot = shannonsplot, units = c("in"), width = 14, height = 11, dpi = 300)
McMurdie and Holmes (2013) filter out taxa that were not seen with more than 3 counts in at least 20% of the samples. Also add a pseudocount of 1 to all counts. This is so that later when we do different calculations (log, division, etc) we don’t get errors due to zeroes
# Prevalence filter in the style of McMurdie & Holmes (2013): keep taxa with
# more than 3 counts in at least 20% of samples, then add a pseudocount of 1
# so later log/ratio transforms never hit zeroes.
# FIX: top-level assignment uses <- to match the rest of the file (was `=`).
ps_filtered <- filter_taxa(ps, function(x) sum(x > 3) > (0.2 * length(x)), TRUE)
ps_filtered <- transform_sample_counts(ps_filtered, function(x) x + 1)
# Also make a filtered version of the relative abundance count table (for plotting purposes)
ps_ra_filtered <- prune_taxa(taxa_names(ps_filtered), ps_ra) # prune from ps_ra object (relative abundances)
# check number of ASVs in each
# (printing each phyloseq object shows its component-table dimensions)
ps
phyloseq-class experiment-level object
otu_table() OTU Table: [ 13427 taxa and 41 samples ]
sample_data() Sample Data: [ 41 samples by 66 sample variables ]
tax_table() Taxonomy Table: [ 13427 taxa by 8 taxonomic ranks ]
phy_tree() Phylogenetic Tree: [ 13427 tips and 13426 internal nodes ]
ps_filtered
phyloseq-class experiment-level object
otu_table() OTU Table: [ 979 taxa and 41 samples ]
sample_data() Sample Data: [ 41 samples by 66 sample variables ]
tax_table() Taxonomy Table: [ 979 taxa by 8 taxonomic ranks ]
phy_tree() Phylogenetic Tree: [ 979 tips and 978 internal nodes ]
ps_ra_filtered
phyloseq-class experiment-level object
otu_table() OTU Table: [ 979 taxa and 41 samples ]
sample_data() Sample Data: [ 41 samples by 66 sample variables ]
tax_table() Taxonomy Table: [ 979 taxa by 8 taxonomic ranks ]
phy_tree() Phylogenetic Tree: [ 979 tips and 978 internal nodes ]
Reduced from 13,427 to 979 ASVs
Re-root tree again in case the root was removed
Lastly, re-root the tree affiliated with the ps_filtered and ps_ra_filtered datasets, since the root may have been removed when filtering
out.group
[1] "ASV_972"
# Then use this outgroup to root the tree
# (my.tree and out.group are defined earlier in the file)
new.tree1 <- ape::root(my.tree, outgroup = out.group, resolve.root = TRUE)
phy_tree(ps_filtered) <- new.tree1
phy_tree(ps_ra_filtered) <- new.tree1
# Check if tree is binary (not multichotomous)
# FIX: is.binary.tree() is deprecated in ape (>= 4.0); is.binary() is the
# supported replacement and returns the same logical result.
is.binary(phy_tree(ps_filtered))
[1] TRUE
based on Coenen et al. tutorials for clustering. See repo
# Estimate covariance matrix for OTUs
covariance_matrix <- as.matrix(otu_table(ps_filtered)) %*% t(otu_table(ps_filtered))
# %*% = matrix multiplication sign in R; used here to multiply OTU/ASV data matrix to itself to estimate covariance.
# Evaluate determinant of covariance matrix
cov_determinant <- det(covariance_matrix)
cov_determinant
[1] 0
The determinant of the covariance matrix (what we just calculated) is equivalent to the product of the proportion of variance explained by every PCA axis. If the determinant is 0, that means there is an axis which explains 0 variance that we can’t separate from the other axes. This means the data need to be transformed to be suitable for PCA.
PCA is essentially a type of PCoA using the Euclidean distance matrix as input. When combined with a log-ratio transformation of the count table, this is deemed appropriate for compositional datasets.
First do a CLR, centered log ratio transformation of the absolute abundance data (after filtering), as suggested by Gloor et al. 2017 and check the determinant of this matrix. Compare it to the determinant without any transformation. {Also commented out for now because takes a few minutes}
# Estimate covariance matrix for absolute abundance ASV table
covariance_matrix <- as.matrix(otu_table(ps_filtered)) %*% t(otu_table(ps_filtered))
# Evaluate determinant of covariance matrix
cov_determinant <- det(covariance_matrix)
# Estimate covariance matrix for CLR-transformed ASV table
# CLR is applied to samples-as-rows (hence the transpose), yielding a samples x ASVs table
clr_asv_table_ps_filtered <- data.frame(compositions::clr(t(otu_table(ps_filtered))))
## Check new determinant of clr transformed table
new_covdet <- det(as.matrix(clr_asv_table_ps_filtered) %*% t(clr_asv_table_ps_filtered))
# Compare
cov_determinant #Original Count Data
[1] 0
new_covdet # New
[1] 1.939146e+130
The determinant of the CLR-transformed table is not zero, so we can proceed with PCA of the CLR-transformed data.
Generate the PCA and visualize axes
# Generate a Principle Component Analysis (PCA) and evaluated based on the eigen decomposition from sample covariance matrix.
lograt_pca <- prcomp(clr_asv_table_ps_filtered)
# NOTE- this is equivalent to first making a Euclidean distance matrix using the CLR data table and then running a PCoA. A Euclidean distance matrix of a log-transformed data table = an Aitchison distance matrix. So this is equivalent to the compositional methods listed in Gloor et al.
# Visual representation with a screeplot
# Proportion of variance per PC axis; the quoted name below must match the
# auto-generated column name from the expression on the next line.
lograt_variances <- as.data.frame(lograt_pca$sdev^2/sum(lograt_pca$sdev^2)) %>% #Extract axes
# Format to plot
select(PercVar = 'lograt_pca$sdev^2/sum(lograt_pca$sdev^2)') %>%
rownames_to_column(var = "PCaxis") %>%
data.frame
head(lograt_variances)
# Plot screeplot
ggplot(lograt_variances, aes(x = as.numeric(PCaxis), y = PercVar)) +
geom_bar(stat = "identity", fill = "grey", color = "black") +
theme_minimal() +
theme(axis.title = element_text(color = "black", face = "bold", size = 10),
axis.text.y = element_text(color = "black", face = "bold"),
axis.text.x = element_blank()) +
labs(x = "PC axis", y = "% Variance", title = "Log-Ratio PCA Screeplot, CLR Tranformation")
First two axes explain a good proportion of variance: 24.8 + 13.4 = 38.2
Visualize the PCA
# lograt_pca$x #View PC values
pca_lograt_frame <- data.frame(lograt_pca$x) %>%
rownames_to_column(var = "Sample Name")
# Merge metadata into the pca data table
pca_lograt_frame <- left_join(pca_lograt_frame, metadata, by = "Sample Name")
pca_lograt_frame
# reorder some factors to make them plot in the order I want
# pca_lograt_frame$OxCond <- factor(pca_lograt_frame$OxCond, levels = c("Oxycline", "ShallowAnoxic", "Euxinic"))
# pca_lograt_frame$SizeFraction <- factor(pca_lograt_frame$SizeFraction, levels = c("PA", "FL"))
pca_lograt_frame <- pca_lograt_frame %>%
mutate(SizeFraction = fct_relevel(SizeFraction, "PA", "FL")) %>%
mutate(OxCond = fct_relevel(OxCond, "Oxycline", "ShallowAnoxic", "Euxinic"))
pca_lograt_frame
# Plot PCA with Redox Regime and Size fraction
pca_lograt_plot <- ggplot(pca_lograt_frame, aes(x = PC1, y = PC2, color = OxCond)) +
geom_point(aes(shape = SizeFraction), size = 4) +
ylab(paste0('PC2 ', round(lograt_variances[2,2]*100,2),'%')) + #Extract y axis value from variance
xlab(paste0('PC1 ', round(lograt_variances[1,2]*100,2),'%')) + #Extract x axis value from variance
ggtitle('CLR-Euclidean PCA') +
scale_color_manual(values = c("blue", "red", "brown4")) +
coord_fixed(ratio = 1) +
theme_bw()
pca_lograt_plot
Use vegan’s envfit to determine relationships between the ordination and environmental variables
# trim metadata to only samples that are in PCA
metadata_ordinations <- metadata[metadata$`Sample Name` %in% sample_data(ps_filtered)$Sample.Name,]
# sort metadata in same order as input to PCA
metadata_ordinations <- metadata_ordinations %>% arrange(factor(`Sample Name`, levels = rownames(clr_asv_table_ps_filtered)))
# fit environmental factors and save stats output
pca_envfit <- envfit(lograt_pca, metadata_ordinations, permutations = 1000)
capture.output(pca_envfit, file = "PCA_envfit_stat.txt")
Many of the typical variables that indicate redox condition are significant (O2, sulfide, NO3, Particulate S, TZVS, etc). I am going to plot O2, H2S, and size fraction only to compare to prokaryotes analysis from 2018 paper
Make each of the interesting variables their own ordination variables for plotting (exclude Station. This will be a color variable anyway and it’s not interesting)
# Single-variable envfit fits, for overlaying individual vectors/centroids on the base-graphics plot
pca_envfit_DO <- envfit(lograt_pca~O2, metadata_ordinations, permutations = 1000)
pca_envfit_H2S <- envfit(lograt_pca~H2S, metadata_ordinations, permutations = 1000)
pca_envfit_sizefraction <- envfit(lograt_pca~SizeFraction, metadata_ordinations, permutations = 1000)
Plot again, now with envfit results (can’t do this in ggplot)
Import
# Load prokaryote (archaea/bacteria) OTU count tables from Suter et al. 2018
arch_counts <- read_csv("Suter_2018_count_tables/Cariaco_AA_updated_raw.csv");
bac_counts <- read_csv("Suter_2018_count_tables/Cariaco_AB_updated_raw.csv");
Get sample names
# Sample columns occupy fixed positions after the "#OTU ID" column in each CSV
bac_samples <- colnames(bac_counts)[2:49]
arch_samples <- colnames(arch_counts)[2:47]
bac_samples
arch_samples
Make separate taxonomy and count variables
# Split each raw count table into a counts-only table ("#OTU ID" + sample
# columns) and a taxonomy-only table (everything else minus helper columns).
# FIX: select(-arch_samples) / select(-bac_samples) pass bare external
# character vectors to tidyselect, which is ambiguous and deprecated;
# all_of() makes the intent explicit without changing the result.
arch_OTU <- arch_counts[, c("#OTU ID", arch_samples)]
arch_taxonomy <- arch_counts %>%
  select(-all_of(arch_samples)) %>%
  select(-Sum)
arch_OTU
arch_taxonomy
bac_OTU <- bac_counts[, c("#OTU ID", bac_samples)]
bac_taxonomy <- bac_counts %>%
  select(-all_of(bac_samples)) %>%
  select(-Sum) %>%
  select(-"Interesting close relatives")
bac_OTU
bac_taxonomy
# Convert each table to a base data frame with "#OTU ID" as rownames, then
# wrap as phyloseq otu_table / tax_table components.
bac_OTU <- type_convert(as.data.frame(bac_OTU))
rownames(bac_OTU) <- bac_OTU$`#OTU ID`
bac_OTU <- bac_OTU[,!names(bac_OTU) %in% (c("#OTU ID"))]
bac_OTU = otu_table(bac_OTU, taxa_are_rows = TRUE)
#
arch_OTU <- type_convert(as.data.frame(arch_OTU))
rownames(arch_OTU) <- arch_OTU$`#OTU ID`
arch_OTU <- arch_OTU[,!names(arch_OTU) %in% (c("#OTU ID"))]
arch_OTU = otu_table(arch_OTU, taxa_are_rows = TRUE)
#
bac_TAX <- type_convert(as.data.frame(bac_taxonomy))
rownames(bac_TAX) <- bac_TAX$`#OTU ID`
bac_TAX <- bac_TAX[,!names(bac_TAX) %in% (c("#OTU ID"))]
bac_TAX = tax_table(as.matrix(bac_TAX))
#
arch_TAX <- type_convert(as.data.frame(arch_taxonomy))
rownames(arch_TAX) <- arch_TAX$`#OTU ID`
arch_TAX <- arch_TAX[,!names(arch_TAX) %in% (c("#OTU ID"))]
arch_TAX = tax_table(as.matrix(arch_TAX))
#
META = sample_data(data.frame(metadata, row.names = metadata$`Sample Name`))
#
ps_bac <- phyloseq(bac_OTU, bac_TAX, META)
ps_arch <- phyloseq(arch_OTU, arch_TAX, META)
Filter out the samples with low sequencing effort. These were previously identified for itags paper
# NOTE(review): despite the names, taxa_to_keep_b / taxa_to_keep_a are logical
# masks over SAMPLE names (used with prune_samples), not taxa.
taxa_to_keep_b <- !sample_names(ps_bac) %in% c("AB3a900A","AB2a200A","AB2b267A")
ps_bac <- prune_samples(taxa_to_keep_b, ps_bac)
taxa_to_keep_a <- !sample_names(ps_arch) %in% c("AA2b900AN","AA2a247B","AA2a900BN","AA2b900BN")
ps_arch <- prune_samples(taxa_to_keep_a, ps_arch)
First calculate relative abundance of bac and arch OTU tables
# Convert counts to relative abundance (compositional transform) per domain
ps_bac_ra <- microbiome::transform(ps_bac, transform = "compositional")
(otu_table(ps_bac_ra))[1:5,1:5]
ps_arch_ra <- microbiome::transform(ps_arch, transform = "compositional")
(otu_table(ps_arch_ra))[1:5,1:5]
Remove rows of glommed taxa from the full dataframe if their sum across all samples doesn’t exceed 5% (RA > 0.05)
# Bacteria
x <- taxa_sums(ps_bac_ra)
# keepTaxa <- base::which(x > .05)
keepTaxa <- x>.05 # prune_taxa require a logical not a list of IDs. compare to keepTaxa above to check
# Apply the same abundance mask to both the RA and raw-count objects
ps_bac_ra_pruned <- prune_taxa(keepTaxa, ps_bac_ra)
ps_bac_pruned <- prune_taxa(keepTaxa, ps_bac)
ps_bac_ra_pruned
ps_bac_pruned
# Archaea
x <- taxa_sums(ps_arch_ra)
# keepTaxa <- base::which(x > .05)
keepTaxa <- x>.05 # prune_taxa require a logical not a list of IDs. compare to keepTaxa above to check
ps_arch_ra_pruned <- prune_taxa(keepTaxa, ps_arch_ra)
ps_arch_pruned <- prune_taxa(keepTaxa, ps_arch)
ps_arch_ra_pruned
ps_arch_pruned
# Eukaryotes
x <- taxa_sums(ps_ra)
# keepTaxa <- base::which(x > .05)
keepTaxa <- x>.05 # prune_taxa require a logical not a list of IDs. compare to keepTaxa above to check
ps_euk_ra_pruned <- prune_taxa(keepTaxa, ps_ra)
ps_euk_pruned <- prune_taxa(keepTaxa, ps)
ps_euk_ra_pruned
ps_euk_pruned
Trimmed to 124 bacteria OTUs, 52 archaea OTUs, and 123 eukaryotic ASVs (299 total). Proceed with this dataset of the most abundant OTUs for correlations and network analyses…
To do the multi-domain analysis, the sample names from each phyloseq object must match. These currently have “B” for bacteria, A, E etc. Remove this letter from sample names so that “AE2a247B”, “AA2a247B”, “AB2a247B” all become just “Type” from the metadata sheet [IntNov1FL in this case- for Interface, November, rep 1, free-living].
Import my SampleKey
# SampleKey maps each domain's sample IDs (SampleID_arch/_bac/_euk) to a shared "Type" label
samplekey <- read_csv("SampleKey.csv")
Change the sample names in the otu tables to sample “Type”
# Archaea
# remove missing archaea samples from samplekey_A
samplekey_A <- filter(samplekey, SampleID_arch %in% colnames(otu_table(ps_arch_ra_pruned)))
# sort SampleKey by order of column names from ps_arch_ra_pruned
samplekey_A <- samplekey_A %>% arrange(factor(SampleID_arch, levels = colnames(otu_table(ps_arch_ra_pruned))))
# replace col names of otu table from ps_arch_ra_pruned
sample_names(ps_arch_ra_pruned) <- samplekey_A$Type
# and ps_arch_pruned
sample_names(ps_arch_pruned) <- samplekey_A$Type
# Bacteria
samplekey_B <- filter(samplekey, SampleID_bac %in% colnames(otu_table(ps_bac_ra_pruned)))
samplekey_B <- samplekey_B %>% arrange(factor(SampleID_bac, levels = colnames(otu_table(ps_bac_ra_pruned))))
sample_names(ps_bac_ra_pruned) <- samplekey_B$Type
sample_names(ps_bac_pruned) <- samplekey_B$Type
# Eukaryotes
samplekey_E <- filter(samplekey, SampleID_euk %in% colnames(otu_table(ps_euk_ra_pruned)))
samplekey_E <- samplekey_E %>% arrange(factor(SampleID_euk, levels = colnames(otu_table(ps_euk_ra_pruned))))
sample_names(ps_euk_ra_pruned) <- samplekey_E$Type
sample_names(ps_euk_pruned) <- samplekey_E$Type
Move all pruned otu tables into one table by matching the sample Type- will use this for SparCC Make one for the 3-domain analysis and one for the 2-domain analysis (bacteria and archaea only)
# bind_rows matches columns by name (Type); samples missing in a domain become NA and are dropped below
alldomains_df <- bind_rows(data.frame(otu_table(ps_bac_pruned)), data.frame(otu_table(ps_arch_pruned)), data.frame(otu_table(ps_euk_pruned)))
alldomains_df
twodomains_df <- bind_rows(data.frame(otu_table(ps_bac_pruned)), data.frame(otu_table(ps_arch_pruned)))
twodomains_df
Change row names from “denovoXXX” to meaningful names
alldomains_df_full <- cbind(ID = rownames(alldomains_df), alldomains_df)
twodomains_df_full <- cbind(ID = rownames(twodomains_df), twodomains_df)
# start with only first rows, which are bacteria. make one column of meaningful labels
# Rows were stacked bacteria-first, so positional slices below recover each domain;
# sum(a, b) is used as a+b to compute the row offsets.
temp1 <- left_join(alldomains_df_full[1:dim(otu_table(ps_bac_pruned))[1],], bac_taxonomy, by = c("ID" = "#OTU ID"))
temp1$New_ID <- paste(temp1$ID, temp1$"taxonomy-2", temp1$"taxonomy-3", temp1$"taxonomy-4")
temp1 <- select(temp1,-colnames(bac_taxonomy[,2:11]))
# next rows are the archaea
temp2 <- left_join(alldomains_df_full[sum(dim(otu_table(ps_bac_pruned))[1],1):sum(dim(otu_table(ps_bac_pruned))[1],dim(otu_table(ps_arch_pruned))[1]),], arch_taxonomy, by = c("ID" = "#OTU ID"))
temp2$New_ID <- paste(temp2$ID, temp2$"taxonomy-2", temp2$"taxonomy-3")
temp2 <- select(temp2,-colnames(arch_taxonomy[,2:9]))
# last rows are eukarya
euk_taxonomy <- cbind("#ASV ID" = rownames(taxonomy), taxonomy)
temp3 <- left_join(alldomains_df_full[sum(dim(otu_table(ps_arch_pruned))[1], dim(otu_table(ps_bac_pruned))[1],1):sum(dim(otu_table(ps_arch_pruned))[1], dim(otu_table(ps_bac_pruned))[1],dim(otu_table(ps_euk_pruned))[1]),], euk_taxonomy, by = c("ID" = "#ASV ID"))
temp3$New_ID <- paste(temp3$ID, temp3$"Supergroup", temp3$"Division", temp3$"Class", temp3$"Order")
temp3 <- select(temp3,-colnames(euk_taxonomy[,2:9]))
# combine back all 3 domains, with new names as row names in a dataframe
alldomains_df_full <- rbind(temp1, temp2, temp3)
alldomains_df_full <- data.frame(alldomains_df_full)
rownames(alldomains_df_full) <- alldomains_df_full$New_ID
alldomains_df_full <- select(alldomains_df_full, -c("ID","New_ID"))
# and make one for the 2-domain dataset
twodomains_df_full <- rbind(temp1, temp2)
twodomains_df_full <- data.frame(twodomains_df_full)
rownames(twodomains_df_full) <- twodomains_df_full$New_ID
twodomains_df_full <- select(twodomains_df_full, -c("ID","New_ID"))
Remove columns with NAs. These are samples for which the library for at least one domain didn’t work (can’t do correlations with missing values in columns)
alldomains_df_full <- alldomains_df_full %>%
select_if(~ !any(is.na(.)))
alldomains_df_full
alldomains_df <- alldomains_df %>%
select_if(~ !any(is.na(.)))
alldomains_df
twodomains_df_full <- twodomains_df_full %>%
select_if(~ !any(is.na(.)))
twodomains_df_full
twodomains_df <- twodomains_df %>%
select_if(~ !any(is.na(.)))
twodomains_df
Similarly, make pruned datasets of the most abundant OTUs/ASVs in the oxycline, anoxic, and euxinic samples as separate datasets
Pull out samples and taxa from each redox regime
# Pull out oxycline bacteria sample IDs
oxyclinetypes_bac <- metadata %>%
filter(`Sample Name` %in% sample_names(ps_bac)) %>%
filter(OxCond == "Oxycline") %>%
select("Sample Name")
oxyclinetypes_bac <- unlist(c(unique(oxyclinetypes_bac)), use.names = FALSE)
# Pull out all bacteria from oxycline
ps_bac_oxycline <- prune_samples(oxyclinetypes_bac, ps_bac)
ps_bac_ra_oxycline <- prune_samples(oxyclinetypes_bac, ps_bac_ra)
# Pull out oxycline archaea sample IDs
oxyclinetypes_arch <- metadata %>%
filter(`Sample Name` %in% sample_names(ps_arch)) %>%
filter(OxCond == "Oxycline") %>%
select("Sample Name")
oxyclinetypes_arch <- unlist(c(unique(oxyclinetypes_arch)), use.names = FALSE)
# Pull out all archaea from oxycline
ps_arch_oxycline <- prune_samples(oxyclinetypes_arch, ps_arch)
ps_arch_ra_oxycline <- prune_samples(oxyclinetypes_arch, ps_arch_ra)
# Pull out oxycline eukaryotic sample IDs
oxyclinetypes_euk <- metadata %>%
filter(`Sample Name` %in% sample_names(ps)) %>%
filter(OxCond == "Oxycline") %>%
select("Sample Name")
oxyclinetypes_euk <- unlist(c(unique(oxyclinetypes_euk)), use.names = FALSE)
# Pull out all eukaryotes from oxycline
ps_euk_oxycline <- prune_samples(oxyclinetypes_euk, ps)
ps_euk_ra_oxycline <- prune_samples(oxyclinetypes_euk, ps_ra)
Filter out low abundance taxa from the oxycline samples. Use 5% as cutoff
# --- Oxycline: drop low-abundance taxa ----------------------------------------
# For each domain, keep taxa whose taxa_sums() on the `_ra` object exceeds
# 0.05, and apply the same logical mask to both the `_ra` and count objects
# so the two stay taxon-for-taxon in sync.
# Bacteria
x <- taxa_sums(ps_bac_ra_oxycline)
keepTaxa <- x>.05 # prune_taxa require a logical not a list of IDs. compare to keepTaxa above to check
ps_bac_ra_oxycline_pruned <- prune_taxa(keepTaxa, ps_bac_ra_oxycline)
ps_bac_oxycline_pruned <- prune_taxa(keepTaxa, ps_bac_oxycline)
# Print the pruned objects to see how many taxa/samples remain.
ps_bac_ra_oxycline_pruned
ps_bac_oxycline_pruned
# Archaea
x <- taxa_sums(ps_arch_ra_oxycline)
keepTaxa <- x>.05 # prune_taxa require a logical not a list of IDs. compare to keepTaxa above to check
ps_arch_ra_oxycline_pruned <- prune_taxa(keepTaxa, ps_arch_ra_oxycline)
ps_arch_oxycline_pruned <- prune_taxa(keepTaxa, ps_arch_oxycline)
ps_arch_ra_oxycline_pruned
ps_arch_oxycline_pruned
# Eukaryotes
x <- taxa_sums(ps_euk_ra_oxycline)
keepTaxa <- x>.05 # prune_taxa require a logical not a list of IDs. compare to keepTaxa above to check
ps_euk_ra_oxycline_pruned <- prune_taxa(keepTaxa, ps_euk_ra_oxycline)
ps_euk_oxycline_pruned <- prune_taxa(keepTaxa, ps_euk_oxycline)
ps_euk_ra_oxycline_pruned
ps_euk_oxycline_pruned
79 bacteria, 36 archaea, 76 eukaryota remain
Change the sample names in the otu tables to “Type”
# --- Oxycline: rename samples to the shared "Type" labels ---------------------
# Each domain was sequenced under its own sample IDs; `samplekey` maps
# SampleID_<domain> -> Type. For each domain: filter samplekey to the samples
# present, sort it into the otu_table column order, then assign Type as the
# new sample names. This lets the domains be matched column-by-column later.
# Archaea
# remove missing archaea samples from samplekey_A
samplekey_A <- filter(samplekey, SampleID_arch %in% colnames(otu_table(ps_arch_ra_oxycline_pruned)))
# sort SampleKey by order of column names from ps_arch_ra_oxycline_pruned
samplekey_A <- samplekey_A %>% arrange(factor(SampleID_arch, levels = colnames(otu_table(ps_arch_ra_oxycline_pruned))))
# replace col names of otu table from ps_arch_ra_oxycline_pruned
sample_names(ps_arch_ra_oxycline_pruned) <- samplekey_A$Type
# and ps_arch_pruned
sample_names(ps_arch_oxycline_pruned) <- samplekey_A$Type
# Bacteria
samplekey_B <- filter(samplekey, SampleID_bac %in% colnames(otu_table(ps_bac_ra_oxycline_pruned)))
samplekey_B <- samplekey_B %>% arrange(factor(SampleID_bac, levels = colnames(otu_table(ps_bac_ra_oxycline_pruned))))
sample_names(ps_bac_ra_oxycline_pruned) <- samplekey_B$Type
sample_names(ps_bac_oxycline_pruned) <- samplekey_B$Type
# Eukaryotes
samplekey_E <- filter(samplekey, SampleID_euk %in% colnames(otu_table(ps_euk_ra_oxycline_pruned)))
samplekey_E <- samplekey_E %>% arrange(factor(SampleID_euk, levels = colnames(otu_table(ps_euk_ra_oxycline_pruned))))
sample_names(ps_euk_ra_oxycline_pruned) <- samplekey_E$Type
sample_names(ps_euk_oxycline_pruned) <- samplekey_E$Type
Move all pruned otu tables into one table by matching the sample Type- will use this for SparCC
# Stack the three domains' count tables (taxa as rows). bind_rows() matches
# columns by the shared "Type" sample names; a sample missing from a domain
# produces NA entries for that domain's taxa (those columns are removed below).
alldomains_df_oxycline <- bind_rows(data.frame(otu_table(ps_bac_oxycline_pruned)), data.frame(otu_table(ps_arch_oxycline_pruned)), data.frame(otu_table(ps_euk_oxycline_pruned)))
alldomains_df_oxycline
Change row names from “denovoXXX” to meaningful names
# --- Oxycline: replace denovo IDs with human-readable taxonomy labels ---------
# The combined table's rows are ordered bacteria, then archaea, then eukarya
# (the bind_rows order above); the positional slices below depend on that.
# `sum(a, 1)` is simply `a + 1`, i.e. the row after the previous domain's last.
alldomains_df_full_oxycline <- cbind(ID = rownames(alldomains_df_oxycline), alldomains_df_oxycline)
# start with only first rows, which are bacteria. make one column of meaningful labels
temp1 <- left_join(alldomains_df_full_oxycline[1:dim(otu_table(ps_bac_oxycline_pruned))[1],], bac_taxonomy, by = c("ID" = "#OTU ID"))
temp1$New_ID <- paste(temp1$ID, temp1$"taxonomy-2", temp1$"taxonomy-3", temp1$"taxonomy-4")
# Drop the joined taxonomy columns again, keeping only ID/counts/New_ID.
temp1 <- select(temp1,-colnames(bac_taxonomy[,2:11]))
# next rows are the archaea
temp2 <- left_join(alldomains_df_full_oxycline[sum(dim(otu_table(ps_bac_oxycline_pruned))[1],1):sum(dim(otu_table(ps_bac_oxycline_pruned))[1],dim(otu_table(ps_arch_oxycline_pruned))[1]),], arch_taxonomy, by = c("ID" = "#OTU ID"))
temp2$New_ID <- paste(temp2$ID, temp2$"taxonomy-2", temp2$"taxonomy-3")
temp2 <- select(temp2,-colnames(arch_taxonomy[,2:9]))
# last rows are eukarya
# Build a join key from the eukaryote taxonomy table's row names (ASV IDs).
euk_taxonomy <- cbind("#ASV ID" = rownames(taxonomy), taxonomy)
temp3 <- left_join(alldomains_df_full_oxycline[sum(dim(otu_table(ps_arch_oxycline_pruned))[1], dim(otu_table(ps_bac_oxycline_pruned))[1],1):sum(dim(otu_table(ps_arch_oxycline_pruned))[1], dim(otu_table(ps_bac_oxycline_pruned))[1],dim(otu_table(ps_euk_oxycline_pruned))[1]),], euk_taxonomy, by = c("ID" = "#ASV ID"))
temp3$New_ID <- paste(temp3$ID, temp3$"Supergroup", temp3$"Division", temp3$"Class", temp3$"Order")
temp3 <- select(temp3,-colnames(euk_taxonomy[,2:9]))
# combine back all 3 domains, with new names as row names in a dataframe
alldomains_df_full_oxycline <- rbind(temp1, temp2, temp3)
alldomains_df_full_oxycline <- data.frame(alldomains_df_full_oxycline)
rownames(alldomains_df_full_oxycline) <- alldomains_df_full_oxycline$New_ID
alldomains_df_full_oxycline <- select(alldomains_df_full_oxycline, -c("ID","New_ID"))
alldomains_df_full_oxycline
Remove columns with NAs. These are samples for which the library for at least one domain didn’t work (can’t do correlations with missing values in columns)
# Keep only complete samples (columns without any NA); a column containing NA
# means the sample lacked a usable library for at least one domain, and
# correlations cannot be computed across missing values.
alldomains_df_full_oxycline <- select(
  alldomains_df_full_oxycline,
  where(function(col) all(!is.na(col)))
)
alldomains_df_full_oxycline
alldomains_df_oxycline <- select(
  alldomains_df_oxycline,
  where(function(col) all(!is.na(col)))
)
alldomains_df_oxycline
21 samples remain for correlation
Pull out samples from shallow anoxic regime
# --- Shallow anoxic: subset samples for each domain ---------------------------
# Same procedure as the oxycline section, selecting OxCond == "ShallowAnoxic".
# Pull out anoxic layer bacteria sample IDs
anoxictypes_bac <- metadata %>%
filter(`Sample Name` %in% sample_names(ps_bac)) %>%
filter(OxCond == "ShallowAnoxic") %>%
select("Sample Name")
# Flatten the one-column result to an unnamed character vector for prune_samples().
anoxictypes_bac <- unlist(c(unique(anoxictypes_bac)), use.names = FALSE)
# Pull out all bacteria from anoxic layer
ps_bac_anoxic <- prune_samples(anoxictypes_bac, ps_bac)
ps_bac_ra_anoxic <- prune_samples(anoxictypes_bac, ps_bac_ra)
# Pull out anoxic layer archaea sample IDs
anoxictypes_arch <- metadata %>%
filter(`Sample Name` %in% sample_names(ps_arch)) %>%
filter(OxCond == "ShallowAnoxic") %>%
select("Sample Name")
anoxictypes_arch <- unlist(c(unique(anoxictypes_arch)), use.names = FALSE)
# Pull out all archaea from anoxic layer
ps_arch_anoxic<- prune_samples(anoxictypes_arch, ps_arch)
ps_arch_ra_anoxic <- prune_samples(anoxictypes_arch, ps_arch_ra)
# Pull out anoxic layer eukaryotic sample IDs
# (eukaryote phyloseq objects are named `ps` / `ps_ra`, no `_euk` suffix)
anoxictypes_euk <- metadata %>%
filter(`Sample Name` %in% sample_names(ps)) %>%
filter(OxCond == "ShallowAnoxic") %>%
select("Sample Name")
anoxictypes_euk <- unlist(c(unique(anoxictypes_euk)), use.names = FALSE)
# Pull out all eukaryotes from anoxic layer
ps_euk_anoxic <- prune_samples(anoxictypes_euk, ps)
ps_euk_ra_anoxic <- prune_samples(anoxictypes_euk, ps_ra)
Filter out low abundance taxa from the shallow anoxic samples. Use 5% as cutoff
# --- Shallow anoxic: drop low-abundance taxa ----------------------------------
# Same 0.05 taxa_sums() threshold as the oxycline section; the logical mask is
# applied to both the `_ra` and count objects so they remain in sync.
# Bacteria
x <- taxa_sums(ps_bac_ra_anoxic)
keepTaxa <- x>.05 # prune_taxa require a logical not a list of IDs. compare to keepTaxa above to check
ps_bac_ra_anoxic_pruned <- prune_taxa(keepTaxa, ps_bac_ra_anoxic)
ps_bac_anoxic_pruned <- prune_taxa(keepTaxa, ps_bac_anoxic)
# Print pruned objects to see remaining taxa/sample counts.
ps_bac_ra_anoxic_pruned
ps_bac_anoxic_pruned
# Archaea
x <- taxa_sums(ps_arch_ra_anoxic)
keepTaxa <- x>.05 # prune_taxa require a logical not a list of IDs. compare to keepTaxa above to check
ps_arch_ra_anoxic_pruned <- prune_taxa(keepTaxa, ps_arch_ra_anoxic)
ps_arch_anoxic_pruned <- prune_taxa(keepTaxa, ps_arch_anoxic)
ps_arch_ra_anoxic_pruned
ps_arch_anoxic_pruned
# Eukaryotes
x <- taxa_sums(ps_euk_ra_anoxic)
keepTaxa <- x>.05 # prune_taxa require a logical not a list of IDs. compare to keepTaxa above to check
ps_euk_ra_anoxic_pruned <- prune_taxa(keepTaxa, ps_euk_ra_anoxic)
ps_euk_anoxic_pruned <- prune_taxa(keepTaxa, ps_euk_anoxic)
ps_euk_ra_anoxic_pruned
ps_euk_anoxic_pruned
32 bacteria, 19 archaea, 37 eukaryota remain
Change the sample names in the otu tables to “Type”
# --- Shallow anoxic: rename samples to shared "Type" labels -------------------
# Filter samplekey to present samples, sort into otu_table column order, then
# assign Type as the new sample names (same procedure as the oxycline section).
# Archaea
# remove missing archaea samples from samplekey_A
samplekey_A <- filter(samplekey, SampleID_arch %in% colnames(otu_table(ps_arch_ra_anoxic_pruned)))
# sort SampleKey by order of column names from ps_arch_ra_anoxic_pruned
samplekey_A <- samplekey_A %>% arrange(factor(SampleID_arch, levels = colnames(otu_table(ps_arch_ra_anoxic_pruned))))
# replace col names of otu table from ps_arch_ra_anoxic_pruned
sample_names(ps_arch_ra_anoxic_pruned) <- samplekey_A$Type
# and ps_arch_pruned
sample_names(ps_arch_anoxic_pruned) <- samplekey_A$Type
# Bacteria
samplekey_B <- filter(samplekey, SampleID_bac %in% colnames(otu_table(ps_bac_ra_anoxic_pruned)))
samplekey_B <- samplekey_B %>% arrange(factor(SampleID_bac, levels = colnames(otu_table(ps_bac_ra_anoxic_pruned))))
sample_names(ps_bac_ra_anoxic_pruned) <- samplekey_B$Type
sample_names(ps_bac_anoxic_pruned) <- samplekey_B$Type
# Eukaryotes
samplekey_E <- filter(samplekey, SampleID_euk %in% colnames(otu_table(ps_euk_ra_anoxic_pruned)))
samplekey_E <- samplekey_E %>% arrange(factor(SampleID_euk, levels = colnames(otu_table(ps_euk_ra_anoxic_pruned))))
sample_names(ps_euk_ra_anoxic_pruned) <- samplekey_E$Type
sample_names(ps_euk_anoxic_pruned) <- samplekey_E$Type
Move all pruned otu tables into one table by matching the sample Type- will use this for SparCC
# Stack the three domains' count tables; columns match by "Type" sample name,
# samples missing in a domain become NA (removed below).
alldomains_df_anoxic <- bind_rows(data.frame(otu_table(ps_bac_anoxic_pruned)), data.frame(otu_table(ps_arch_anoxic_pruned)), data.frame(otu_table(ps_euk_anoxic_pruned)))
alldomains_df_anoxic
Change row names from “denovoXXX” to meaningful names
# --- Shallow anoxic: replace denovo IDs with taxonomy labels ------------------
# Rows are ordered bacteria, archaea, eukarya (bind_rows order above); the
# positional slices below rely on that. `sum(a, 1)` is `a + 1`.
alldomains_df_full_anoxic <- cbind(ID = rownames(alldomains_df_anoxic), alldomains_df_anoxic)
# start with only first rows, which are bacteria. make one column of meaningful labels
temp1 <- left_join(alldomains_df_full_anoxic[1:dim(otu_table(ps_bac_anoxic_pruned))[1],], bac_taxonomy, by = c("ID" = "#OTU ID"))
temp1$New_ID <- paste(temp1$ID, temp1$"taxonomy-2", temp1$"taxonomy-3", temp1$"taxonomy-4")
# Drop the joined taxonomy columns again.
temp1 <- select(temp1,-colnames(bac_taxonomy[,2:11]))
# next rows are the archaea
temp2 <- left_join(alldomains_df_full_anoxic[sum(dim(otu_table(ps_bac_anoxic_pruned))[1],1):sum(dim(otu_table(ps_bac_anoxic_pruned))[1],dim(otu_table(ps_arch_anoxic_pruned))[1]),], arch_taxonomy, by = c("ID" = "#OTU ID"))
temp2$New_ID <- paste(temp2$ID, temp2$"taxonomy-2", temp2$"taxonomy-3")
temp2 <- select(temp2,-colnames(arch_taxonomy[,2:9]))
# last rows are eukarya
# Rebuild the eukaryote join key (also done in the oxycline section; harmless).
euk_taxonomy <- cbind("#ASV ID" = rownames(taxonomy), taxonomy)
temp3 <- left_join(alldomains_df_full_anoxic[sum(dim(otu_table(ps_arch_anoxic_pruned))[1], dim(otu_table(ps_bac_anoxic_pruned))[1],1):sum(dim(otu_table(ps_arch_anoxic_pruned))[1], dim(otu_table(ps_bac_anoxic_pruned))[1],dim(otu_table(ps_euk_anoxic_pruned))[1]),], euk_taxonomy, by = c("ID" = "#ASV ID"))
temp3$New_ID <- paste(temp3$ID, temp3$"Supergroup", temp3$"Division", temp3$"Class", temp3$"Order")
temp3 <- select(temp3,-colnames(euk_taxonomy[,2:9]))
# combine back all 3 domains, with new names as row names in a dataframe
alldomains_df_full_anoxic <- rbind(temp1, temp2, temp3)
alldomains_df_full_anoxic <- data.frame(alldomains_df_full_anoxic)
rownames(alldomains_df_full_anoxic) <- alldomains_df_full_anoxic$New_ID
alldomains_df_full_anoxic <- select(alldomains_df_full_anoxic, -c("ID","New_ID"))
alldomains_df_full_anoxic
Remove columns with NAs. These are samples for which the library for at least one domain didn’t work (can’t do correlations with missing values in columns)
# Keep only complete samples (columns without any NA); an NA column means the
# sample lacked a usable library for at least one domain.
alldomains_df_full_anoxic <- select(
  alldomains_df_full_anoxic,
  where(function(col) all(!is.na(col)))
)
alldomains_df_full_anoxic
alldomains_df_anoxic <- select(
  alldomains_df_anoxic,
  where(function(col) all(!is.na(col)))
)
alldomains_df_anoxic
11 samples remain for correlation
Pull out samples from euxinic regime
# --- Euxinic: subset samples for each domain ----------------------------------
# Same procedure as above, selecting OxCond == "Euxinic".
# Pull out euxinic layer bacteria sample IDs
euxinictypes_bac <- metadata %>%
filter(`Sample Name` %in% sample_names(ps_bac)) %>%
filter(OxCond == "Euxinic") %>%
select("Sample Name")
# Flatten the one-column result to an unnamed character vector for prune_samples().
euxinictypes_bac <- unlist(c(unique(euxinictypes_bac)), use.names = FALSE)
# Pull out all bacteria from euxinic layer
ps_bac_euxinic <- prune_samples(euxinictypes_bac, ps_bac)
ps_bac_ra_euxinic <- prune_samples(euxinictypes_bac, ps_bac_ra)
# Pull out euxinic layer archaea sample IDs
euxinictypes_arch <- metadata %>%
filter(`Sample Name` %in% sample_names(ps_arch)) %>%
filter(OxCond == "Euxinic") %>%
select("Sample Name")
euxinictypes_arch <- unlist(c(unique(euxinictypes_arch)), use.names = FALSE)
# Pull out all archaea from euxinic layer
ps_arch_euxinic<- prune_samples(euxinictypes_arch, ps_arch)
ps_arch_ra_euxinic <- prune_samples(euxinictypes_arch, ps_arch_ra)
# Pull out euxinic layer eukaryotic sample IDs
# (eukaryote phyloseq objects are named `ps` / `ps_ra`, no `_euk` suffix)
euxinictypes_euk <- metadata %>%
filter(`Sample Name` %in% sample_names(ps)) %>%
filter(OxCond == "Euxinic") %>%
select("Sample Name")
euxinictypes_euk <- unlist(c(unique(euxinictypes_euk)), use.names = FALSE)
# Pull out all eukaryotes from euxinic layer
ps_euk_euxinic <- prune_samples(euxinictypes_euk, ps)
ps_euk_ra_euxinic <- prune_samples(euxinictypes_euk, ps_ra)
Filter out low abundance taxa from the euxinic samples. Use 5% as cutoff
# --- Euxinic: drop low-abundance taxa -----------------------------------------
# Same 0.05 taxa_sums() threshold; same mask applied to `_ra` and count objects.
# Bacteria
x <- taxa_sums(ps_bac_ra_euxinic)
keepTaxa <- x>.05 # prune_taxa require a logical not a list of IDs. compare to keepTaxa above to check
ps_bac_ra_euxinic_pruned <- prune_taxa(keepTaxa, ps_bac_ra_euxinic)
ps_bac_euxinic_pruned <- prune_taxa(keepTaxa, ps_bac_euxinic)
# Print pruned objects to see remaining taxa/sample counts.
ps_bac_ra_euxinic_pruned
ps_bac_euxinic_pruned
# Archaea
x <- taxa_sums(ps_arch_ra_euxinic)
keepTaxa <- x>.05 # prune_taxa require a logical not a list of IDs. compare to keepTaxa above to check
ps_arch_ra_euxinic_pruned <- prune_taxa(keepTaxa, ps_arch_ra_euxinic)
ps_arch_euxinic_pruned <- prune_taxa(keepTaxa, ps_arch_euxinic)
ps_arch_ra_euxinic_pruned
ps_arch_euxinic_pruned
# Eukaryotes
x <- taxa_sums(ps_euk_ra_euxinic)
keepTaxa <- x>.05 # prune_taxa require a logical not a list of IDs. compare to keepTaxa above to check
ps_euk_ra_euxinic_pruned <- prune_taxa(keepTaxa, ps_euk_ra_euxinic)
ps_euk_euxinic_pruned <- prune_taxa(keepTaxa, ps_euk_euxinic)
ps_euk_ra_euxinic_pruned
ps_euk_euxinic_pruned
16 bacteria, 16 archaea, 20 eukaryota remain
Change the sample names in the otu tables to “Type”
# --- Euxinic: rename samples to shared "Type" labels --------------------------
# Filter samplekey to present samples, sort into otu_table column order, then
# assign Type as the new sample names (same procedure as earlier sections).
# Archaea
# remove missing archaea samples from samplekey_A
samplekey_A <- filter(samplekey, SampleID_arch %in% colnames(otu_table(ps_arch_ra_euxinic_pruned)))
# sort SampleKey by order of column names from ps_arch_ra_euxinic_pruned
samplekey_A <- samplekey_A %>% arrange(factor(SampleID_arch, levels = colnames(otu_table(ps_arch_ra_euxinic_pruned))))
# replace col names of otu table from ps_arch_ra_euxinic_pruned
sample_names(ps_arch_ra_euxinic_pruned) <- samplekey_A$Type
# and ps_arch_pruned
sample_names(ps_arch_euxinic_pruned) <- samplekey_A$Type
# Bacteria
samplekey_B <- filter(samplekey, SampleID_bac %in% colnames(otu_table(ps_bac_ra_euxinic_pruned)))
samplekey_B <- samplekey_B %>% arrange(factor(SampleID_bac, levels = colnames(otu_table(ps_bac_ra_euxinic_pruned))))
sample_names(ps_bac_ra_euxinic_pruned) <- samplekey_B$Type
sample_names(ps_bac_euxinic_pruned) <- samplekey_B$Type
# Eukaryotes
samplekey_E <- filter(samplekey, SampleID_euk %in% colnames(otu_table(ps_euk_ra_euxinic_pruned)))
samplekey_E <- samplekey_E %>% arrange(factor(SampleID_euk, levels = colnames(otu_table(ps_euk_ra_euxinic_pruned))))
sample_names(ps_euk_ra_euxinic_pruned) <- samplekey_E$Type
sample_names(ps_euk_euxinic_pruned) <- samplekey_E$Type
Move all pruned otu tables into one table by matching the sample Type- will use this for SparCC
# Stack the three domains' count tables; columns match by "Type" sample name,
# samples missing in a domain become NA (removed below).
alldomains_df_euxinic <- bind_rows(data.frame(otu_table(ps_bac_euxinic_pruned)), data.frame(otu_table(ps_arch_euxinic_pruned)), data.frame(otu_table(ps_euk_euxinic_pruned)))
alldomains_df_euxinic
Change row names from “denovoXXX” to meaningful names
# --- Euxinic: replace denovo IDs with taxonomy labels -------------------------
# Rows are ordered bacteria, archaea, eukarya (bind_rows order above); the
# positional slices below rely on that. `sum(a, 1)` is `a + 1`.
alldomains_df_full_euxinic <- cbind(ID = rownames(alldomains_df_euxinic), alldomains_df_euxinic)
# start with only first rows, which are bacteria. make one column of meaningful labels
temp1 <- left_join(alldomains_df_full_euxinic[1:dim(otu_table(ps_bac_euxinic_pruned))[1],], bac_taxonomy, by = c("ID" = "#OTU ID"))
temp1$New_ID <- paste(temp1$ID, temp1$"taxonomy-2", temp1$"taxonomy-3", temp1$"taxonomy-4")
# Drop the joined taxonomy columns again.
temp1 <- select(temp1,-colnames(bac_taxonomy[,2:11]))
# next rows are the archaea
temp2 <- left_join(alldomains_df_full_euxinic[sum(dim(otu_table(ps_bac_euxinic_pruned))[1],1):sum(dim(otu_table(ps_bac_euxinic_pruned))[1],dim(otu_table(ps_arch_euxinic_pruned))[1]),], arch_taxonomy, by = c("ID" = "#OTU ID"))
temp2$New_ID <- paste(temp2$ID, temp2$"taxonomy-2", temp2$"taxonomy-3")
temp2 <- select(temp2,-colnames(arch_taxonomy[,2:9]))
# last rows are eukarya
# Rebuild the eukaryote join key (also done in earlier sections; harmless).
euk_taxonomy <- cbind("#ASV ID" = rownames(taxonomy), taxonomy)
temp3 <- left_join(alldomains_df_full_euxinic[sum(dim(otu_table(ps_arch_euxinic_pruned))[1], dim(otu_table(ps_bac_euxinic_pruned))[1],1):sum(dim(otu_table(ps_arch_euxinic_pruned))[1], dim(otu_table(ps_bac_euxinic_pruned))[1],dim(otu_table(ps_euk_euxinic_pruned))[1]),], euk_taxonomy, by = c("ID" = "#ASV ID"))
temp3$New_ID <- paste(temp3$ID, temp3$"Supergroup", temp3$"Division", temp3$"Class", temp3$"Order")
temp3 <- select(temp3,-colnames(euk_taxonomy[,2:9]))
# combine back all 3 domains, with new names as row names in a dataframe
alldomains_df_full_euxinic <- rbind(temp1, temp2, temp3)
alldomains_df_full_euxinic <- data.frame(alldomains_df_full_euxinic)
rownames(alldomains_df_full_euxinic) <- alldomains_df_full_euxinic$New_ID
alldomains_df_full_euxinic <- select(alldomains_df_full_euxinic, -c("ID","New_ID"))
alldomains_df_full_euxinic
Remove columns with NAs. These are samples for which the library for at least one domain didn’t work (can’t do correlations with missing values in columns)
# Keep only complete samples (columns without any NA); an NA column means the
# sample lacked a usable library for at least one domain.
alldomains_df_full_euxinic <- select(
  alldomains_df_full_euxinic,
  where(function(col) all(!is.na(col)))
)
alldomains_df_full_euxinic
alldomains_df_euxinic <- select(
  alldomains_df_euxinic,
  where(function(col) all(!is.na(col)))
)
alldomains_df_euxinic
4 samples remain for correlation
This is largely based on BVCN tutorials NOTE- input for SparCC should be raw count data (after filtering out low-abundance ASVs). The function does a log-ratio transformation to account for compositionality
# Run SparCC on the combined all-domain raw count table. The table is taxa x
# samples, so transpose to samples-in-rows as sparcc() expects; sparcc handles
# the compositional (log-ratio) transformation internally.
sparcctable_alldomains <- sparcc(t(alldomains_df))
Put sample names back into result tables
# Label the SparCC correlation and covariance matrices with the taxon names;
# sparcc() returns unnamed matrices whose rows/columns follow the input order.
taxon_labels <- rownames(alldomains_df_full)
dimnames(sparcctable_alldomains$Cor) <- list(taxon_labels, taxon_labels)
dimnames(sparcctable_alldomains$Cov) <- list(taxon_labels, taxon_labels)
sparcctable_alldomains$Cor[1:2,1:2]
Plot correlation
# Cluster-order the correlation matrix, keep only its upper triangle, and melt
# to long format for a ggplot tile heatmap (na.omit drops the blanked lower
# triangle). NOTE(review): reorder_cormat() and get_upper_tri() are helper
# functions defined elsewhere in this file.
plotableSparcc <- sparcctable_alldomains$Cor %>% reorder_cormat %>% get_upper_tri() %>% reshape2::melt() %>% na.omit()
Sparcc_plot <- plotableSparcc %>% ggplot(aes(x = Var2, y = Var1, fill = value)) + geom_tile() + scale_fill_gradient2() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
Sparcc_plot
# ggsave("figures/sparcc_corr_alldomains.eps",Sparcc_plot, width = 35, height = 35, units = c("in"))
Calculate Sparcc p-values by bootstrapping- TAKES A LONG TIME
# Bootstrap SparCC p-values (commented out: per the notes below, R = 1000
# iterations took ~14 hours on 2 cores; the result `out2` is reloaded from a
# saved workspace instead).
# tp0 <- proc.time()
# out2 <- sparccboot(t(alldomains_df), R = 1000, ncpus = 2)
# tp1 <- proc.time()
# tp1 - tp0
The above took ~14 hours to run 1000 iterations
Extract p-values
# Extract bootstrap correlations and p-values (returned as flat vectors over
# taxon pairs) and rebuild full symmetric matrices from them. Filling the
# upper triangle with diag = 0.5 and then adding the transpose yields 1.0 on
# the diagonal. NOTE(review): this assumes pval.sparccboot()'s pair ordering
# matches R's column-wise upper.tri() fill order — worth confirming against
# the SpiecEasi documentation.
outP <- pval.sparccboot(out2)
data.frame(outP$cors, outP$pvals) %>% head
cors <- outP$cors
pvals <- outP$pvals
sparCCpcors <- diag(0.5, nrow = dim(sparcctable_alldomains$Cor)[1], ncol = dim(sparcctable_alldomains$Cor)[1])
sparCCpcors[upper.tri(sparCCpcors, diag=FALSE)] <- cors
sparCCpcors <- sparCCpcors + t(sparCCpcors)
sparCCpval <- diag(0.5, nrow = dim(sparcctable_alldomains$Cor)[1], ncol = dim(sparcctable_alldomains$Cor)[1])
sparCCpval[upper.tri(sparCCpval, diag=FALSE)] <- pvals
sparCCpval <- sparCCpval + t(sparCCpval)
# Restore taxon labels on both matrices.
rownames(sparCCpcors) <- rownames(alldomains_df_full)
colnames(sparCCpcors) <- rownames(alldomains_df_full)
rownames(sparCCpval) <- rownames(alldomains_df_full)
colnames(sparCCpval) <- rownames(alldomains_df_full)
sparCCpcors[1:2, 1:2]
sparCCpval[1:2, 1:2]
Reorder for plotting
# Reorder the correlation and p-value matrices together (reorder_cor_and_p()
# is a helper defined elsewhere in this file), melt each upper triangle to
# long format, join them by taxon pair, and compute Benjamini-Hochberg FDR
# across all pairwise p-values.
reordered_all_sparcc <- reorder_cor_and_p(sparCCpcors, sparCCpval)
reordered_sparccCor <- reordered_all_sparcc$r
reordered_sparccP<- reordered_all_sparcc$p
sparccCor_processed <- reordered_sparccCor %>% get_upper_tri() %>% reshape2::melt() %>% na.omit() %>% rename(cor = value)
sparccP_processed <- reordered_sparccP %>% get_upper_tri() %>% reshape2::melt() %>% na.omit() %>% rename(p = value)
# join the two data frames
SparccP <- left_join(sparccCor_processed, sparccP_processed, by = c("Var1", "Var2")) %>%
# # remove self correlations
# filter(Var1 != Var2) %>%
# calculate the false discovery rate to adjust for multiple p values
mutate(fdr = p.adjust(p, method = "BH"))
And plot correlation with p-values. Circles mark pairs whose BH-adjusted bootstrap p-value (FDR) is below 0.01
# Tile heatmap of SparCC correlations; open circles (shape = 1) overlay the
# pairs whose BH-adjusted bootstrap p-value is below fdrThresh.
fdrThresh <- 0.01 # fdr threshold
sparccOkP <- SparccP%>% filter(fdr < fdrThresh)
SparccP_plot <- SparccP %>% ggplot(aes(x = Var2, y = Var1, fill = cor)) + geom_tile() + scale_fill_gradient2() + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + geom_point(data = sparccOkP, shape = 1)
SparccP_plot
ggsave("figures/sparcc_corr_alldomains_w_pvals.eps",SparccP_plot, width = 35, height = 35, units = c("in"))
Save environment again
# save.image("EnvironmentBackups/CariacoEuks_postanalysis_vars_upto_sparcc_bootstrap.RData")
Or load if coming back
# Restore the saved workspace when resuming here (avoids re-running the slow
# sparccboot step above).
load("EnvironmentBackups/CariacoEuks_postanalysis_vars_upto_sparcc_bootstrap.RData")
Try the SpiecEasi method, which accounts for sparse data, as described in the SpiecEasi publication, spieceasi github, and BVCN lessons 1.2. This reduces the clumps (eg. sparse relationships that are secondary or teriary, not direct relationships).
Make functions from tutorial
#' Convert a SpiecEasi fit into a full pairwise edge-weight table.
#'
#' Pulls the regularized covariance from the fit, rescales it to a correlation
#' matrix, masks it by the refit adjacency (so only selected edges carry a
#' weight), and expands the sparse upper-triangular edge list into a complete
#' Var1 x Var2 grid suitable for geom_tile() plotting.
#'
#' @param se_out A fitted spiec.easi object (must support getOptCov/getRefit).
#' @param sp.names Character vector of taxon names, in the same order as the
#'   columns of the data the model was fit on.
#' @return A data.frame with columns Var1, Var2 and Weight. Unselected edges
#'   have Weight 0; the redundant strictly-lower-triangular cells are NA.
convertSEToTable <- function(se_out, sp.names){
  # Correlation-scaled optimal covariance; see the SpiecEasi documentation for
  # how to pull out edge weights for comparison.
  secor <- cov2cor(as.matrix(getOptCov(se_out)))
  # Sparse upper triangle (k = 1 excludes the diagonal) of the weighted
  # adjacency, summarised as an i / j / weight edge list.
  elist <- summary(triu(secor*getRefit(se_out), k=1))
  elist[,1] <- sp.names[elist[,1]]
  elist[,2] <- sp.names[elist[,2]]
  # "Var1 Var2" key used to place each edge into the full grid below.
  elist[,4] <- paste(elist[,1], elist[,2])
  full_e <- expand.grid(sp.names, sp.names)
  rownames(full_e) <- paste(full_e[,1], full_e[,2])
  full_e[,"Weight"] <- 0
  full_e[elist[,4], "Weight"] <- elist[,3]
  # Blank the lower triangle so each pair appears once in the heatmap.
  idx <- expand.grid(seq_along(sp.names), seq_along(sp.names))
  full_e[idx[,"Var1"] > idx[,"Var2"], "Weight"] <- NA
  return(as.data.frame(full_e, stringsAsFactors = FALSE))
}
Follow the spieceasi documentation to find optimal parameters. Also, because I want to compare networks, this convo on using optimal parameters for different network comparisons is helpful.
Remove samples from the phyloseq objects that are not in all 3 domains and reorder samples so they are in same order in all 3 objects
# --- All depths, 3 domains: align samples, then fit SpiecEasi -----------------
# spiec.easi() on a list of phyloseq objects needs the same samples in the
# same order in every object: keep only samples common to all three domains,
# then reorder the archaea/eukaryote OTU-table columns to the bacteria order.
bac_arch_common <- intersect(sample_names(ps_bac_ra_pruned), sample_names(ps_arch_ra_pruned))
all_common <- intersect(bac_arch_common, sample_names(ps_euk_ra_pruned))
ps_bac_pruned_3domains <- prune_samples(all_common, ps_bac_pruned)
ps_arch_pruned_3domains <- prune_samples(all_common, ps_arch_pruned)
ps_euk_pruned_3domains <- prune_samples(all_common, ps_euk_pruned)
ps_bac_ra_pruned_3domains <- prune_samples(all_common, ps_bac_ra_pruned)
ps_arch_ra_pruned_3domains <- prune_samples(all_common, ps_arch_ra_pruned)
ps_euk_ra_pruned_3domains <- prune_samples(all_common, ps_euk_ra_pruned)
otu_table(ps_arch_pruned_3domains) <- otu_table(ps_arch_pruned_3domains)[,sample_names(ps_bac_ra_pruned_3domains)]
otu_table(ps_euk_pruned_3domains) <- otu_table(ps_euk_pruned_3domains)[,sample_names(ps_bac_ra_pruned_3domains)]
# Print sample metadata to confirm the three objects now match.
sample_data(ps_bac_pruned_3domains)
sample_data(ps_arch_pruned_3domains)
sample_data(ps_euk_pruned_3domains)
#Run Spieceasi
# Fixed seed for the pulsar (StARS) subsampling, for reproducibility.
pargs <- list(seed=10010)
se <- spiec.easi(list(ps_bac_pruned_3domains, ps_arch_pruned_3domains, ps_euk_pruned_3domains), method='glasso', lambda.min.ratio=1e-2, nlambda=100, pulsar.params=pargs)
# StARS stability; target is ~0.05 (see notes below).
getStability(se)
the above takes a while to run (20-30 mins). Using parameters above, the stability along the lambda path crosses the 0.05 threshold and the final stability value (0.044) is sufficiently close to 0.05
#This is just a fancy helper function to get the data in a comparable format to the output of above
# (colnames(t(x)) is just rownames(x): the taxon labels of the combined table.)
tab.se <- convertSEToTable(se,sp.names=colnames(t(alldomains_df_full)))
#Plot
plot.se <- ggplot(tab.se,aes(x = Var1, y = Var2, fill = Weight)) + geom_tile() + scale_fill_gradient2() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
plot(plot.se)
ggsave("figures/spieceasi_alldomains.eps",plot.se, width = 35, height = 35, units = c("in"))
Note- only the significant values above show up in the heatmap above (ie. there is no “p-value”)
Remove samples from the phyloseq objects that are not in both domains and reorder samples so they are in same order in all 3 objects
# --- All depths, 2 domains (bacteria + archaea): align samples ----------------
# Keep only the samples common to both domains, then reorder the archaea
# OTU-table columns to the bacteria sample order so spiec.easi() sees the
# same samples in the same order in both objects.
bac_arch_common <- intersect(sample_names(ps_bac_ra_pruned), sample_names(ps_arch_ra_pruned))
ps_bac_pruned_2domains <- prune_samples(bac_arch_common, ps_bac_pruned)
ps_arch_pruned_2domains <- prune_samples(bac_arch_common, ps_arch_pruned)
ps_bac_ra_pruned_2domains <- prune_samples(bac_arch_common, ps_bac_ra_pruned)
ps_arch_ra_pruned_2domains <- prune_samples(bac_arch_common, ps_arch_ra_pruned)
# BUGFIX: previously reordered by sample_names(ps_bac_ra_pruned_3domains),
# i.e. the 3-domain common set — a subset of the 2-domain set — which would
# drop/misalign the extra bacteria-archaea samples. Use the 2-domain object.
otu_table(ps_arch_pruned_2domains) <- otu_table(ps_arch_pruned_2domains)[,sample_names(ps_bac_ra_pruned_2domains)]
# Print sample metadata to confirm the two objects now match.
sample_data(ps_bac_pruned_2domains)
sample_data(ps_arch_pruned_2domains)
#Run Spieceasi
# Fixed seed for the pulsar (StARS) subsampling, for reproducibility.
pargs <- list(seed=10010)
se.2domains <- spiec.easi(list(ps_bac_pruned_2domains, ps_arch_pruned_2domains), method='glasso', lambda.min.ratio=1e-2, nlambda=200, pulsar.params=pargs)
# StARS stability; target is ~0.05 (see notes below).
getStability(se.2domains)
the above takes a while to run . Using parameters above, the stability along the lambda path crosses the 0.05 threshold and the final stability value (0.046) is close to 0.05
#This is just a fancy helper function to get the data in a comparable format to the output of above
# NOTE(review): `twodomains_df_full` is built elsewhere in this file; its row
# order must match the taxa order of the two phyloseq objects passed to
# spiec.easi() above — confirm.
tab.se <- convertSEToTable(se.2domains,sp.names=colnames(t(twodomains_df_full)))
#Plot
plot.se <- ggplot(tab.se,aes(x = Var1, y = Var2, fill = Weight)) + geom_tile() + scale_fill_gradient2() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
plot(plot.se)
ggsave("figures/spieceasi_2domains.eps",plot.se, width = 35, height = 35, units = c("in"))
Note- only the significant values above show up in the heatmap above (ie. there is no “p-value”)
# --- Oxycline, 3 domains: align samples, then fit SpiecEasi -------------------
# Same alignment as above: keep samples common to all three domains and
# reorder archaea/eukaryote OTU-table columns to the bacteria sample order.
bac_arch_common <- intersect(sample_names(ps_bac_oxycline_pruned), sample_names(ps_arch_oxycline_pruned))
all_common <- intersect(bac_arch_common, sample_names(ps_euk_oxycline_pruned))
ps_bac_oxycline_pruned <- prune_samples(all_common, ps_bac_oxycline_pruned)
ps_arch_oxycline_pruned <- prune_samples(all_common, ps_arch_oxycline_pruned)
ps_euk_oxycline_pruned <- prune_samples(all_common, ps_euk_oxycline_pruned)
otu_table(ps_arch_oxycline_pruned) <- otu_table(ps_arch_oxycline_pruned)[,sample_names(ps_bac_oxycline_pruned)]
otu_table(ps_euk_oxycline_pruned) <- otu_table(ps_euk_oxycline_pruned)[,sample_names(ps_bac_oxycline_pruned)]
# Print sample metadata to confirm the three objects now match.
sample_data(ps_bac_oxycline_pruned)
sample_data(ps_arch_oxycline_pruned)
sample_data(ps_euk_oxycline_pruned)
#Run Spieceasi
pargs <- list(seed=10010)
# Smaller lambda.min.ratio / more lambdas than the full-dataset fit, tuned for
# this smaller sample set (see notes below).
se.oxycline <- spiec.easi(list(ps_bac_oxycline_pruned, ps_arch_oxycline_pruned, ps_euk_oxycline_pruned), method='glasso', lambda.min.ratio=5e-3, nlambda=300, pulsar.params=pargs)
getStability(se.oxycline)
the above takes a couple of minutes to run. Stability and stability along lambda path are very similar to the full dataset spieceasi object (se) with these parameters above. Continue with these.
#This is just a fancy helper function to get the data in a comparable format to the output of above
# (colnames(t(x)) is just rownames(x): the taxonomy-labelled row names.)
tab.se.oxycline <- convertSEToTable(se.oxycline, sp.names=colnames(t(alldomains_df_full_oxycline)))
#Plot
plot.se.oxycline <- ggplot(tab.se.oxycline,aes(x = Var1, y = Var2, fill = Weight)) + geom_tile() + scale_fill_gradient2() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
plot(plot.se.oxycline)
ggsave("figures/spieceasi_alldomains_oxycline.eps",plot.se.oxycline, width = 35, height = 35, units = c("in"))
# --- Shallow anoxic, 3 domains: align samples, then fit SpiecEasi -------------
bac_arch_common <- intersect(sample_names(ps_bac_anoxic_pruned), sample_names(ps_arch_anoxic_pruned))
all_common <- intersect(bac_arch_common, sample_names(ps_euk_anoxic_pruned))
ps_bac_anoxic_pruned <- prune_samples(all_common, ps_bac_anoxic_pruned)
ps_arch_anoxic_pruned <- prune_samples(all_common, ps_arch_anoxic_pruned)
ps_euk_anoxic_pruned <- prune_samples(all_common, ps_euk_anoxic_pruned)
# Reorder archaea/eukaryote columns to the bacteria sample order.
otu_table(ps_arch_anoxic_pruned) <- otu_table(ps_arch_anoxic_pruned)[,sample_names(ps_bac_anoxic_pruned)]
otu_table(ps_euk_anoxic_pruned) <- otu_table(ps_euk_anoxic_pruned)[,sample_names(ps_bac_anoxic_pruned)]
# Print sample metadata to confirm the three objects now match.
sample_data(ps_bac_anoxic_pruned)
sample_data(ps_arch_anoxic_pruned)
sample_data(ps_euk_anoxic_pruned)
#Run Spieceasi
pargs <- list(seed=10010)
# Larger lambda.min.ratio than the other fits, tuned for this sample set.
se.anoxic <- spiec.easi(list(ps_bac_anoxic_pruned, ps_arch_anoxic_pruned, ps_euk_anoxic_pruned), method='glasso', lambda.min.ratio=1e-1, nlambda=300, pulsar.params=pargs)
getStability(se.anoxic)
the above takes a couple of minutes to run
#This is just a fancy helper function to get the data in a comparable format to the output of above
tab.se.anoxic <- convertSEToTable(se.anoxic, sp.names=colnames(t(alldomains_df_full_anoxic)))
#Plot
plot.se.anoxic <- ggplot(tab.se.anoxic,aes(x = Var1, y = Var2, fill = Weight)) + geom_tile() + scale_fill_gradient2() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
plot(plot.se.anoxic)
ggsave("figures/spieceasi_alldomains_anoxic.eps",plot.se.anoxic, width = 35, height = 35, units = c("in"))
# --- Euxinic, 3 domains: align samples, then attempt SpiecEasi ----------------
bac_arch_common <- intersect(sample_names(ps_bac_euxinic_pruned), sample_names(ps_arch_euxinic_pruned))
all_common <- intersect(bac_arch_common, sample_names(ps_euk_euxinic_pruned))
ps_bac_euxinic_pruned <- prune_samples(all_common, ps_bac_euxinic_pruned)
ps_arch_euxinic_pruned <- prune_samples(all_common, ps_arch_euxinic_pruned)
ps_euk_euxinic_pruned <- prune_samples(all_common, ps_euk_euxinic_pruned)
# Reorder archaea/eukaryote columns to the bacteria sample order.
otu_table(ps_arch_euxinic_pruned) <- otu_table(ps_arch_euxinic_pruned)[,sample_names(ps_bac_euxinic_pruned)]
otu_table(ps_euk_euxinic_pruned) <- otu_table(ps_euk_euxinic_pruned)[,sample_names(ps_bac_euxinic_pruned)]
# Print sample metadata to confirm the three objects now match.
sample_data(ps_bac_euxinic_pruned)
sample_data(ps_arch_euxinic_pruned)
sample_data(ps_euk_euxinic_pruned)
#Run Spieceasi
pargs <- list(seed=10010)
# Per the notes below, no parameter combination gave a satisfactory StARS
# stability here — too few euxinic samples survive quality filtering.
se.euxinic <- spiec.easi(list(ps_bac_euxinic_pruned, ps_arch_euxinic_pruned, ps_euk_euxinic_pruned), method='glasso', lambda.min.ratio=1e-5,nlambda=20, pulsar.params=pargs)
getStability(se.euxinic)
I tried many parameters on the above but cannot get a satisfactory solution. There are just too few samples after quality filtering to do SpiecEasi on the euxinic depths only.
# Checkpoint the workspace after the (slow) SpiecEasi fits.
save.image("EnvironmentBackups/CariacoEuks_postanalysis_vars_upto_spieceasi.RData")
Or load if coming back
# Restore the checkpoint when resuming the analysis from this point.
load("EnvironmentBackups/CariacoEuks_postanalysis_vars_upto_spieceasi.RData")
Build networks from the SpiecEasi association matrices using iGraph
# --- Build and plot the 3-domain SpiecEasi network with igraph ----------------
#Extract adjacency matrix from spiecEasi output
adj.mat <- getRefit(se)
# Count selected (1) vs unselected (0) edges.
table(as.numeric(adj.mat))
# Extract weighted adjacency
se.cor <- cov2cor(as.matrix(getOptCov(se)))
weighted.adj.mat <- se.cor*getRefit(se)
#Convert to graph objects
grph.unweighted <- adj2igraph(adj.mat)
grph <- adj2igraph(weighted.adj.mat)
# Put back in species names
# NOTE(review): assumes the vertex order from adj2igraph matches the row order
# of alldomains_df (bacteria, then archaea, then eukarya) — confirm.
V(grph)$name <- rownames(alldomains_df)
# V(grph)
# Make size of nodes proportional to degree (number of connections)
V(grph)$size <- (degree(grph) + 1) # the +1 avoids size zero vertices
# Color edges by connection (positive or negative)
# E(grph)$color <- custombluegreen
# E(grph)$color[E(grph)$weight<0] <- customreddishpurple
# Change width of edges to be proportional to their weights
E(grph)$width <- abs(E(grph)$weight)*10
# Scale node sizes to be smaller
V(grph)$size <- V(grph)$size/2
# Remove low-weight edges (you decide what threshold is right for your network):
# weight_threshold <- 0.07
# grph <- delete.edges(grph,which(abs(E(grph)$weight)<weight_threshold))
# Color nodes by domain
dtype <- c(rep("red",ntaxa(ps_bac_pruned_3domains)), rep("green",ntaxa(ps_arch_pruned_3domains)), rep("blue",ntaxa(ps_euk_pruned_3domains)))
# Plot
# NOTE(review): layout_with_graphopt() is computed separately for the screen
# and EPS plots and is typically stochastic, so the two renderings may differ;
# consider computing the layout once and reusing it.
plot(grph,
vertex.label=NA,
layout=layout_with_graphopt(grph),
vertex.color=dtype)
title("SpiecEasi Network: All domains, Whole Water Column")
legend("topright",bty = "n",
legend=c("Bacteria","Archaea", "Eukarya"),
fill=c("red","green","blue"), border=NA)
# Save plot
setEPS()
postscript(file = "Figures/3domains_alldepths_spieceasi_network.eps", width = 5.5, height = 5)
plot(grph,
vertex.label=NA,
layout=layout_with_graphopt(grph),
vertex.color=dtype)
title("SpiecEasi Network: All domains, Whole Water Column")
legend("topright",bty = "n",
legend=c("Bacteria","Archaea", "Eukarya"),
fill=c("red","green","blue"), border=NA)
dev.off()
# --- Build the 2-domain (Bac + Arch), whole-water-column network ---
#Extract adjacency matrix from spiecEasi output
adj.mat <- getRefit(se.2domains)
table(as.numeric(adj.mat))
# Extract weighted adjacency
se.cor <- cov2cor(as.matrix(getOptCov(se.2domains)))
weighted.adj.mat <- se.cor*getRefit(se.2domains)
#Convert to graph objects
# NOTE(review): grph.unweighted is overwritten here -- it previously held the
# all-domain unweighted graph. Rename (e.g. grph.unweighted.2domains) if the
# earlier object is still needed.
grph.unweighted <- adj2igraph(adj.mat)
grph.2domains <- adj2igraph(weighted.adj.mat)
# Put back in species names
V(grph.2domains)$name <- rownames(twodomains_df)
# V(grph.2domains)
# Make size of nodes proportional to degree (number of connections)
V(grph.2domains)$size <- (degree(grph.2domains) + 1) # the +1 avoids size zero vertices
# Color edges by connection (positive or negative)
# E(grph.2domains)$color <- custombluegreen
# E(grph.2domains)$color[E(grph.2domains)$weight<0] <- customreddishpurple
# Change width of edges to be proportional to their weights
E(grph.2domains)$width <- abs(E(grph.2domains)$weight)*10
# Scale node sizes to be smaller
V(grph.2domains)$size <- V(grph.2domains)$size/2
# Remove low-weight edges (you decide what threshold is right for your network):
# weight_threshold <- 0.07
# grph.2domains <- delete.edges(grph.2domains,which(abs(E(grph.2domains)$weight)<weight_threshold))
# Color nodes by domain (bacteria first, then archaea, matching input order)
dtype <- c(rep("red",ntaxa(ps_bac_pruned_2domains)), rep("green",ntaxa(ps_arch_pruned_2domains)))
# Plot the 2-domain network on screen, nodes colored by domain
plot(grph.2domains,
vertex.label=NA,
layout=layout_with_graphopt(grph.2domains),
vertex.color=dtype)
title("SpiecEasi Network: Bacteria and Archaea, Whole Water Column")
legend("topright",bty = "n",
legend=c("Bacteria","Archaea"),
fill=c("red","green"), border=NA)
# Save plot: re-draw the same figure into an EPS file
setEPS()
postscript(file = "Figures/2domains_alldepths_spieceasi_network.eps", width = 5.5, height = 5)
plot(grph.2domains,
vertex.label=NA,
layout=layout_with_graphopt(grph.2domains),
vertex.color=dtype)
title("SpiecEasi Network: Bacteria and Archaea, Whole Water Column")
legend("topright",bty = "n",
legend=c("Bacteria","Archaea"),
fill=c("red","green"), border=NA)
dev.off()
# --- Build the all-domain, oxycline-only network ---
#Extract adjacency matrix from spiecEasi output
adj.mat <- getRefit(se.oxycline)
table(as.numeric(adj.mat))
# Extract weighted adjacency
se.cor <- cov2cor(as.matrix(getOptCov(se.oxycline)))
weighted.adj.mat <- se.cor*getRefit(se.oxycline)
#Convert to graph objects
grph.unweighted.oxycline <- adj2igraph(adj.mat)
grph.oxycline <- adj2igraph(weighted.adj.mat)
# Put back in species names
V(grph.oxycline)$name <- rownames(alldomains_df_oxycline)
# V(grph.oxycline)
# Make size of nodes proportional to degree (number of connections)
V(grph.oxycline)$size <- (degree(grph.oxycline) + 1) # the +1 avoids size zero vertices
# Color edges by connection (positive or negative)
# E(grph.oxycline)$color <- custombluegreen
# E(grph.oxycline)$color[E(grph.oxycline)$weight<0] <- customreddishpurple
# Change width of edges to be proportional to their weights
E(grph.oxycline)$width <- abs(E(grph.oxycline)$weight)*10
# Scale node sizes to be smaller
V(grph.oxycline)$size <- V(grph.oxycline)$size/2
# Remove low-weight edges (you decide what threshold is right for your network):
# weight_threshold <- 0.07
# grph.oxycline <- delete.edges(grph.oxycline,which(abs(E(grph.oxycline)$weight)<weight_threshold))
# Color nodes by domain (bacteria, then archaea, then eukaryotes, matching input order)
dtype <- c(rep("red",ntaxa(ps_bac_oxycline_pruned)), rep("green",ntaxa(ps_arch_oxycline_pruned)), rep("blue",ntaxa(ps_euk_oxycline_pruned)))
# Plot the oxycline network on screen, nodes colored by domain
plot(grph.oxycline,
vertex.label=NA,
layout=layout_with_graphopt(grph.oxycline),
vertex.color=dtype)
title("SpiecEasi Network: All domains, Oxycline")
legend("topright",bty = "n",
legend=c("Bacteria","Archaea", "Eukarya"),
fill=c("red","green","blue"), border=NA)
# Save plot: re-draw the same figure into an EPS file
setEPS()
postscript(file = "Figures/3domains_oxycline_spieceasi_network.eps", width = 5.5, height = 5)
plot(grph.oxycline,
vertex.label=NA,
layout=layout_with_graphopt(grph.oxycline),
vertex.color=dtype)
title("SpiecEasi Network: All domains, Oxycline")
legend("topright",bty = "n",
legend=c("Bacteria","Archaea", "Eukarya"),
fill=c("red","green","blue"), border=NA)
dev.off()
# --- Build the all-domain, anoxic-layer network ---
#Extract adjacency matrix from spiecEasi output
adj.mat <- getRefit(se.anoxic)
table(as.numeric(adj.mat))
# Extract weighted adjacency
se.cor <- cov2cor(as.matrix(getOptCov(se.anoxic)))
weighted.adj.mat <- se.cor*getRefit(se.anoxic)
#Convert to graph objects
grph.unweighted.anoxic <- adj2igraph(adj.mat)
grph.anoxic <- adj2igraph(weighted.adj.mat)
# Put back in species names
V(grph.anoxic)$name <- rownames(alldomains_df_anoxic)
# V(grph.anoxic)
# Make size of nodes proportional to degree (number of connections)
V(grph.anoxic)$size <- (degree(grph.anoxic) + 1) # the +1 avoids size zero vertices
# Color edges by connection (positive or negative)
# E(grph.anoxic)$color <- custombluegreen
# E(grph.anoxic)$color[E(grph.anoxic)$weight<0] <- customreddishpurple
# Change width of edges to be proportional to their weights
E(grph.anoxic)$width <- abs(E(grph.anoxic)$weight)*10
# Scale node sizes to be smaller
V(grph.anoxic)$size <- V(grph.anoxic)$size/2
# Remove low-weight edges (you decide what threshold is right for your network):
# weight_threshold <- 0.07
# grph.anoxic <- delete.edges(grph.anoxic,which(abs(E(grph.anoxic)$weight)<weight_threshold))
# Color nodes by domain (bacteria, then archaea, then eukaryotes, matching input order)
dtype <- c(rep("red",ntaxa(ps_bac_anoxic_pruned)), rep("green",ntaxa(ps_arch_anoxic_pruned)), rep("blue",ntaxa(ps_euk_anoxic_pruned)))
# Plot the anoxic-layer network on screen, nodes colored by domain
plot(grph.anoxic,
vertex.label=NA,
layout=layout_with_graphopt(grph.anoxic),
vertex.color=dtype)
title("SpiecEasi Network: All domains, Anoxic Layer")
legend("topright",bty = "n",
legend=c("Bacteria","Archaea", "Eukarya"),
fill=c("red","green","blue"), border=NA)
# Save plot: re-draw the same figure into an EPS file
setEPS()
postscript(file = "Figures/3domains_anoxic_spieceasi_network.eps", width = 5.5, height = 5)
plot(grph.anoxic,
vertex.label=NA,
layout=layout_with_graphopt(grph.anoxic),
vertex.color=dtype)
# Title fixed for consistency: the saved figure previously said "Anoxic" while
# the on-screen version (and the other figure titles) use the full layer name
title("SpiecEasi Network: All domains, Anoxic Layer")
legend("topright",bty = "n",
legend=c("Bacteria","Archaea", "Eukarya"),
fill=c("red","green","blue"), border=NA)
dev.off()
The number of edges, and how many are positive vs. negative:
# --- Edge counts and sign breakdown for each of the four networks ---
# total number of edges in full dataset network
length(E(grph)$weight)
# percent of neg edges
(sum(E(grph)$weight<0)/length(E(grph)$weight))*100
# total number of edges in 2-domain dataset network
length(E(grph.2domains)$weight)
# percent of neg edges
(sum(E(grph.2domains)$weight<0)/length(E(grph.2domains)$weight))*100
# total number of edges in oxycline network
length(E(grph.oxycline)$weight)
# percent of neg edges
(sum(E(grph.oxycline)$weight<0)/length(E(grph.oxycline)$weight))*100
# total number of edges in anoxic network (comment fixed: previously mislabeled "oxycline")
length(E(grph.anoxic)$weight)
# percent of neg edges
(sum(E(grph.anoxic)$weight<0)/length(E(grph.anoxic)$weight))*100
Declining number of total edges going from full dataset –> oxycline only –> anoxic only. But the percentage of negative associations is similar (34.4-37.7%). Most associations (~65%) in each network are positive.
The number of edges relative to the total number of possible edges (edge density):
# Edge density (realized edges / possible edges) for each network
edge_density(grph)
edge_density(grph.2domains)
edge_density(grph.oxycline)
edge_density(grph.anoxic)
The full dataset has the highest edge density, then oxycline, then anoxic
The size of the components, or “clumps,” in the network, and how many members in each
# Number of connected components ($no) and size of each component ($csize)
# full dataset
components(grph)$no
components(grph)$csize
# 2 domains
components(grph.2domains)$no
components(grph.2domains)$csize
# oxycline
components(grph.oxycline)$no
components(grph.oxycline)$csize
# anoxic
components(grph.anoxic)$no
components(grph.anoxic)$csize
The anoxic network is most disjointed, with 48 clumps and the largest containing only 24 members. The next is oxycline, with 32 clumps and the largest with 144 members. Then the full dataset has only 27 clumps and the largest clump contains 262 members.
Path is the shortest distance between two nodes (fewest number of edges). Average path length of a network gives a sense of how connected every node is to another. Unconnected hubs in the network will have “infinite” paths from other hubs. The function mean_distance ignores the infinite paths and calculates the average of all other paths
# Mean shortest-path length within each network (unreachable pairs are ignored)
mean_distance(grph)
mean_distance(grph.2domains)
mean_distance(grph.oxycline)
mean_distance(grph.anoxic)
The longest average path length is in the oxycline, followed by the whole dataset and then anoxic. Meaning the nodes in the anoxic are more closely associated with each other. Even though there are more hubs in anoxic, as shown above, the nodes in the hubs are close to each other. The oxycline hubs have the longest average distances between nodes.
Calculate network parameters (degree, strength, betweenness, closeness, clustering coefficient) for each individual node:
# --- Per-node measures for the full (3-domain, all depths) network ---
# First change the weights of the edges (the strength of association) to absolute
# value. This won't work if negative associations are left with negative signs,
# because betweenness/closeness treat weights as distances and need positive values.
E(grph)$weight <- abs(E(grph)$weight)
node_names <- V(grph)$name                   # renamed: `names` shadowed base::names
de <- degree(grph)                           # number of edges per node
st <- strength(grph)                         # sum of |edge weights| (graph.strength() is the deprecated alias)
be <- betweenness(grph, normalized = TRUE)   # fraction of shortest paths passing through the node
cc <- closeness(grph)
l.cluster <- transitivity(grph, "local")     # local clustering coefficient (NaN for degree < 2)
# Assemble per-node dataset; row order is bacteria, then archaea, then eukaryotes,
# matching the order the tables were concatenated before running SpiecEasi
fulldateset_node_measures <- data.frame(ID = node_names, degree = de, strength = st, betweenness = be, closeness = cc, clustering_coefficient = l.cluster)
# Row-range bookkeeping for re-attaching taxonomy domain by domain
n_bac <- dim(otu_table(ps_bac_pruned_3domains))[1]
n_arch <- dim(otu_table(ps_arch_pruned_3domains))[1]
n_euk <- dim(otu_table(ps_euk_pruned_3domains))[1]
# Put back bacterial taxonomy
temp1 <- left_join(fulldateset_node_measures[1:n_bac, ], bac_taxonomy, by = c("ID" = "#OTU ID"))
# Drop columns absent from the archaeal/eukaryotic tables so rbind() works below
temp1 <- select(temp1, -"taxonomy-9", -"Refined taxonomy")
# Archaeal and eukaryotic rows (explicit ranges replace the opaque sum(a, b) arithmetic)
temp2 <- left_join(fulldateset_node_measures[(n_bac + 1):(n_bac + n_arch), ], arch_taxonomy, by = c("ID" = "#OTU ID"))
temp3 <- left_join(fulldateset_node_measures[(n_bac + n_arch + 1):(n_bac + n_arch + n_euk), ], euk_taxonomy, by = c("ID" = "#ASV ID"))
# Rename eukaryotic rank columns to match those from Bac and Arch
temp3 <- temp3 %>%
  rename("taxonomy-1" = Kingdom, "taxonomy-2" = Supergroup, "taxonomy-3" = Division, "taxonomy-4" = Class, "taxonomy-5" = Order, "taxonomy-6" = Family, "taxonomy-7" = Genus, "taxonomy-8" = Species)
# Combine back all 3 domains into one data frame
fulldateset_node_measures <- rbind(temp1, temp2, temp3)
fulldateset_node_measures
Plot betweenness vs degree for each node. - Tipton et al. argue that nodes with high betweenness are “bottlenecks” or important connectors and nodes with high degree are “hubs” - Berry et al. argue that nodes with low betweenness, high degree, high closeness, and high transitivity are candidate keystone species - Add in closeness into the node’s plotly label since these don’t vary much node-to-node and wouldn’t make sense to plot
# Clean the node-measure table for plotting:
# - set NaN clustering coefficients (nodes with degree < 2) to 0, as the original
#   comment intended -- the previous blanket replace(is.na(.), "unidentified")
#   overwrote those NaN cells with a string, coercing the numeric column to character
# - replace NA taxonomy entries with "unidentified", restricted to character
#   columns so numeric measure columns are untouched (assumes taxonomy columns
#   are character, as readr produces -- TODO confirm none are factors)
# - drop nodes with 0 betweenness (can't take log10 of 0 on the y axis)
fulldateset_node_measures <- fulldateset_node_measures %>%
  mutate(clustering_coefficient = replace(clustering_coefficient, is.nan(clustering_coefficient), 0)) %>%
  mutate(across(where(is.character), ~ replace(.x, is.na(.x), "unidentified"))) %>%
  filter(betweenness != 0)
# get enough colors and randomly rearrange so they are easier to separate on the plot
mycolors <- colorRampPalette(brewer.pal(12, "Paired"))(length(unique(fulldateset_node_measures$`taxonomy-3`)))
mycolors <- sample(mycolors)
# Interactive scatter of betweenness (log10 y) vs degree; the extra aes()
# mappings exist only to surface those columns in the plotly hover tooltip
p <- ggplot(fulldateset_node_measures, aes(x = degree, y = betweenness, ID = ID, shape = `taxonomy-1`, `taxonomy-2` = `taxonomy-2`, color = `taxonomy-3`, `taxonomy-4` = `taxonomy-4`, `taxonomy-5` = `taxonomy-5`)) +
  geom_point(size = 5) +
  scale_y_continuous(trans='log10') +
  scale_color_manual(values = mycolors) +
  theme_bw()
ggplotly(p, tooltip = c("ID","taxonomy-2", "taxonomy-3", "taxonomy-4", "taxonomy-5"))
Summary
Plot node betweenness and transitivity (clustering coefficient)
# Interactive scatter of betweenness (log10 y) vs local clustering coefficient;
# extra aes() mappings only surface taxonomy columns in the plotly hover tooltip
# plot with plotly and so I can hover over points and determine which taxa they are
p <- ggplot(fulldateset_node_measures, aes(x = clustering_coefficient, y = betweenness, ID = ID, shape = `taxonomy-1`, `taxonomy-2` = `taxonomy-2`, color = `taxonomy-3`, `taxonomy-4` = `taxonomy-4`, `taxonomy-5` = `taxonomy-5`)) +
geom_point(size = 5) +
scale_color_manual(values = mycolors) +
scale_y_continuous(trans='log10') +
theme_bw()
ggplotly(p, tooltip = c("ID","taxonomy-2", "taxonomy-3", "taxonomy-4", "taxonomy-5"))
Summary - High betweenness but low clustering coef: - Deferibacteres/ Marine Group A - Some Syndiniales - Gammaproteobacteria: OM182 clade and Xanthomonadales/ Sinobacteraceae clade, and Vibrionaceae clade - Actinobacteria/ Acidimicrobiales/ Sva0966 marine group OTU - High betweenness and high clustering coefficient - Cnidaria, Crustaceae, several Syndinales, Planctomycetes OM190, Euryarchaeota MG II, 1 Spumellarida - Low betweenness, High clustering coefficient - Thaumarchaeota MG I, 1 Spumellarida,1 Dinoflagelata/ Dinophyceae, and the No Blast Hit OTU (not even domain assigned- denovo129797- blast this one)
# --- Per-node measures for the oxycline network ---
# First change the weights of the edges (the strength of association) to absolute
# value. This won't work if negative associations are left with negative signs,
# because betweenness/closeness treat weights as distances and need positive values.
E(grph.oxycline)$weight <- abs(E(grph.oxycline)$weight)
node_names <- V(grph.oxycline)$name                   # renamed: `names` shadowed base::names
de <- degree(grph.oxycline)                           # number of edges per node
st <- strength(grph.oxycline)                         # sum of |edge weights| (graph.strength() is the deprecated alias)
be <- betweenness(grph.oxycline, normalized = TRUE)   # fraction of shortest paths passing through the node
cc <- closeness(grph.oxycline)
l.cluster <- transitivity(grph.oxycline, "local")     # local clustering coefficient (NaN for degree < 2)
# Assemble per-node dataset; row order is bacteria, then archaea, then eukaryotes,
# matching the order the tables were concatenated before running SpiecEasi
fulldateset_node_measures <- data.frame(ID = node_names, degree = de, strength = st, betweenness = be, closeness = cc, clustering_coefficient = l.cluster)
# Row-range bookkeeping for re-attaching taxonomy domain by domain
n_bac <- dim(otu_table(ps_bac_oxycline_pruned))[1]
n_arch <- dim(otu_table(ps_arch_oxycline_pruned))[1]
n_euk <- dim(otu_table(ps_euk_oxycline_pruned))[1]
# Put back bacterial taxonomy
temp1 <- left_join(fulldateset_node_measures[1:n_bac, ], bac_taxonomy, by = c("ID" = "#OTU ID"))
# Drop columns absent from the archaeal/eukaryotic tables so rbind() works below
temp1 <- select(temp1, -"taxonomy-9", -"Refined taxonomy")
# Archaeal and eukaryotic rows (explicit ranges replace the opaque sum(a, b) arithmetic)
temp2 <- left_join(fulldateset_node_measures[(n_bac + 1):(n_bac + n_arch), ], arch_taxonomy, by = c("ID" = "#OTU ID"))
temp3 <- left_join(fulldateset_node_measures[(n_bac + n_arch + 1):(n_bac + n_arch + n_euk), ], euk_taxonomy, by = c("ID" = "#ASV ID"))
# Rename eukaryotic rank columns to match those from Bac and Arch
temp3 <- temp3 %>%
  rename("taxonomy-1" = Kingdom, "taxonomy-2" = Supergroup, "taxonomy-3" = Division, "taxonomy-4" = Class, "taxonomy-5" = Order, "taxonomy-6" = Family, "taxonomy-7" = Genus, "taxonomy-8" = Species)
# Combine back all 3 domains into one data frame
fulldateset_node_measures <- rbind(temp1, temp2, temp3)
fulldateset_node_measures
Plot betweenness vs degree for each node.
# Clean the oxycline node-measure table for plotting:
# - set NaN clustering coefficients (nodes with degree < 2) to 0, as the original
#   comment intended -- the previous blanket replace(is.na(.), "unidentified")
#   overwrote those NaN cells with a string, coercing the numeric column to character
# - replace NA taxonomy entries with "unidentified", restricted to character
#   columns so numeric measure columns are untouched (assumes taxonomy columns
#   are character, as readr produces -- TODO confirm none are factors)
# - drop nodes with 0 betweenness (can't take log10 of 0 on the y axis)
fulldateset_node_measures <- fulldateset_node_measures %>%
  mutate(clustering_coefficient = replace(clustering_coefficient, is.nan(clustering_coefficient), 0)) %>%
  mutate(across(where(is.character), ~ replace(.x, is.na(.x), "unidentified"))) %>%
  filter(betweenness != 0)
# get enough colors and randomly rearrange so they are easier to separate on the plot
mycolors <- colorRampPalette(brewer.pal(12, "Paired"))(length(unique(fulldateset_node_measures$`taxonomy-3`)))
mycolors <- sample(mycolors)
# Interactive scatter of betweenness (log10 y) vs degree; the extra aes()
# mappings exist only to surface those columns in the plotly hover tooltip
p <- ggplot(fulldateset_node_measures, aes(x = degree, y = betweenness, ID = ID, shape = `taxonomy-1`, `taxonomy-2` = `taxonomy-2`, color = `taxonomy-3`, `taxonomy-4` = `taxonomy-4`, `taxonomy-5` = `taxonomy-5`)) +
  geom_point(size = 5) +
  scale_y_continuous(trans='log10') +
  scale_color_manual(values = mycolors) +
  theme_bw()
ggplotly(p, tooltip = c("ID","taxonomy-2", "taxonomy-3", "taxonomy-4", "taxonomy-5"))
Summary
Plot node betweenness and transitivity (clustering coefficient)
# Interactive scatter of betweenness (log10 y) vs local clustering coefficient;
# extra aes() mappings only surface taxonomy columns in the plotly hover tooltip
# plot with plotly and so I can hover over points and determine which taxa they are
p <- ggplot(fulldateset_node_measures, aes(x = clustering_coefficient, y = betweenness, ID = ID, shape = `taxonomy-1`, `taxonomy-2` = `taxonomy-2`, color = `taxonomy-3`, `taxonomy-4` = `taxonomy-4`, `taxonomy-5` = `taxonomy-5`)) +
geom_point(size = 5) +
scale_color_manual(values = mycolors) +
scale_y_continuous(trans='log10') +
theme_bw()
ggplotly(p, tooltip = c("ID","taxonomy-2", "taxonomy-3", "taxonomy-4", "taxonomy-5"))
Summary - …
# --- Per-node measures for the anoxic-layer network ---
# First change the weights of the edges (the strength of association) to absolute
# value. This won't work if negative associations are left with negative signs,
# because betweenness/closeness treat weights as distances and need positive values.
E(grph.anoxic)$weight <- abs(E(grph.anoxic)$weight)
node_names <- V(grph.anoxic)$name                   # renamed: `names` shadowed base::names
de <- degree(grph.anoxic)                           # number of edges per node
st <- strength(grph.anoxic)                         # sum of |edge weights| (graph.strength() is the deprecated alias)
be <- betweenness(grph.anoxic, normalized = TRUE)   # fraction of shortest paths passing through the node
cc <- closeness(grph.anoxic)
l.cluster <- transitivity(grph.anoxic, "local")     # local clustering coefficient (NaN for degree < 2)
# Assemble per-node dataset; row order is bacteria, then archaea, then eukaryotes,
# matching the order the tables were concatenated before running SpiecEasi
fulldateset_node_measures <- data.frame(ID = node_names, degree = de, strength = st, betweenness = be, closeness = cc, clustering_coefficient = l.cluster)
# Row-range bookkeeping for re-attaching taxonomy domain by domain
n_bac <- dim(otu_table(ps_bac_anoxic_pruned))[1]
n_arch <- dim(otu_table(ps_arch_anoxic_pruned))[1]
n_euk <- dim(otu_table(ps_euk_anoxic_pruned))[1]
# Put back bacterial taxonomy
temp1 <- left_join(fulldateset_node_measures[1:n_bac, ], bac_taxonomy, by = c("ID" = "#OTU ID"))
# Drop columns absent from the archaeal/eukaryotic tables so rbind() works below
temp1 <- select(temp1, -"taxonomy-9", -"Refined taxonomy")
# Archaeal and eukaryotic rows (explicit ranges replace the opaque sum(a, b) arithmetic)
temp2 <- left_join(fulldateset_node_measures[(n_bac + 1):(n_bac + n_arch), ], arch_taxonomy, by = c("ID" = "#OTU ID"))
temp3 <- left_join(fulldateset_node_measures[(n_bac + n_arch + 1):(n_bac + n_arch + n_euk), ], euk_taxonomy, by = c("ID" = "#ASV ID"))
# Rename eukaryotic rank columns to match those from Bac and Arch
temp3 <- temp3 %>%
  rename("taxonomy-1" = Kingdom, "taxonomy-2" = Supergroup, "taxonomy-3" = Division, "taxonomy-4" = Class, "taxonomy-5" = Order, "taxonomy-6" = Family, "taxonomy-7" = Genus, "taxonomy-8" = Species)
# Combine back all 3 domains into one data frame
fulldateset_node_measures <- rbind(temp1, temp2, temp3)
fulldateset_node_measures
Plot betweenness vs degree for each node.
# Clean the anoxic node-measure table for plotting:
# - set NaN clustering coefficients (nodes with degree < 2) to 0, as the original
#   comment intended -- the previous blanket replace(is.na(.), "unidentified")
#   overwrote those NaN cells with a string, coercing the numeric column to character
# - replace NA taxonomy entries with "unidentified", restricted to character
#   columns so numeric measure columns are untouched (assumes taxonomy columns
#   are character, as readr produces -- TODO confirm none are factors)
# - drop nodes with 0 betweenness (can't take log10 of 0 on the y axis)
fulldateset_node_measures <- fulldateset_node_measures %>%
  mutate(clustering_coefficient = replace(clustering_coefficient, is.nan(clustering_coefficient), 0)) %>%
  mutate(across(where(is.character), ~ replace(.x, is.na(.x), "unidentified"))) %>%
  filter(betweenness != 0)
# get enough colors and randomly rearrange so they are easier to separate on the plot
mycolors <- colorRampPalette(brewer.pal(12, "Paired"))(length(unique(fulldateset_node_measures$`taxonomy-3`)))
mycolors <- sample(mycolors)
# Interactive scatter of betweenness (log10 y) vs degree; the extra aes()
# mappings exist only to surface those columns in the plotly hover tooltip
p <- ggplot(fulldateset_node_measures, aes(x = degree, y = betweenness, ID = ID, shape = `taxonomy-1`, `taxonomy-2` = `taxonomy-2`, color = `taxonomy-3`, `taxonomy-4` = `taxonomy-4`, `taxonomy-5` = `taxonomy-5`)) +
  geom_point(size = 5) +
  scale_y_continuous(trans='log10') +
  scale_color_manual(values = mycolors) +
  theme_bw()
ggplotly(p, tooltip = c("ID","taxonomy-2", "taxonomy-3", "taxonomy-4", "taxonomy-5"))
Summary
Plot node betweenness and transitivity (clustering coefficient)
# Interactive scatter of betweenness (log10 y) vs local clustering coefficient;
# extra aes() mappings only surface taxonomy columns in the plotly hover tooltip
# plot with plotly and so I can hover over points and determine which taxa they are
p <- ggplot(fulldateset_node_measures, aes(x = clustering_coefficient, y = betweenness, ID = ID, shape = `taxonomy-1`, `taxonomy-2` = `taxonomy-2`, color = `taxonomy-3`, `taxonomy-4` = `taxonomy-4`, `taxonomy-5` = `taxonomy-5`)) +
geom_point(size = 5) +
scale_color_manual(values = mycolors) +
scale_y_continuous(trans='log10') +
theme_bw()
ggplotly(p, tooltip = c("ID","taxonomy-2", "taxonomy-3", "taxonomy-4", "taxonomy-5"))
Summary - …